Example #1
 def find_new_invalid_svc(self, invalid_history_file_path):
     # Use dell support URL to check those unknown even after invalid history
     valid_set = set([])
     max_retry = 3  # retry 3 times at most
     new_invalid_count = 0
     for svc in self.target_svc_set:
         for i in xrange(max_retry):
             try:
                 if i == max_retry - 1:
                     time.sleep(1)  # last time retry, sleep for 1 second
                 if SVCGenerator.check_svc_valid(svc):
                     valid_set.add(svc)
                 else:
                     new_invalid_count += 1
                     self.logger.info("不合法:%s " % svc)
                     # Append new invalid SVC into history file if provided
                     FileUtil.save_object_to_path(svc,
                                                  invalid_history_file_path,
                                                  append=True)
                 break
             except requests.exceptions.ConnectionError:
                 # if ConnectionError, pass
                 continue
         else:
             self.logger.warn("检查查询码超时%s,忽略" % svc)
     self.target_svc_set = valid_set
     self.logger.info("新增%s个不合法的查询码" % new_invalid_count)
Example #2
 def find_new_invalid_svc(self, invalid_history_file_path):
     # Use dell support URL to check those unknown even after invalid history
     valid_set = set([])
     max_retry = 3 # retry 3 times at most
     new_invalid_count = 0
     for svc in self.target_svc_set:
         for i in xrange(max_retry):
             try:
                 if i == max_retry - 1:
                     time.sleep(1) # last time retry, sleep for 1 second
                 if SVCGenerator.check_svc_valid(svc):
                     valid_set.add(svc)
                 else:
                     new_invalid_count += 1
                     self.logger.info("不合法:%s " % svc)
                     # Append new invalid SVC into history file if provided
                     FileUtil.save_object_to_path(svc, invalid_history_file_path, append=True)
                 break
             except requests.exceptions.ConnectionError:
                 # if ConnectionError, pass
                 continue
         else:
             self.logger.warn("检查查询码超时%s,忽略" % svc)
     self.target_svc_set = valid_set
     self.logger.info("新增%s个不合法的查询码" % new_invalid_count)
    def _run(self,
             final_thresholds,
             maj_thresholds,
             matrix_file_path=None,
             artifact_map_file_path=None):
        if not matrix_file_path:
            matrix_file_path = self.default_matrix_path()
        if not artifact_map_file_path:
            artifact_map_file_path = self._default_a2eMap_path()
        if not FileUtil.file_exists(matrix_file_path):
            log.error(
                f"File does not exists: {matrix_file_path}\n"
                f"Please pass a valid file path or call {self.__class__.__name__}().precalculate() first"
            )
        if not FileUtil.file_exists(artifact_map_file_path):
            log.error(
                f"File does not exists: {artifact_map_file_path}\n"
                f"Please pass a valid file path or call {self.__class__.__name__}().precalculate() first"
            )

        trace_link_data_structure = ElementLevelTraceLinkDataStructure.load_data_from(
            matrix_file_path, artifact_map_file_path)
        trace_link_processor = MajProcessor(trace_link_data_structure,
                                            self.similarity_filter,
                                            self.req_reduce_func,
                                            self.code_reduce_function,
                                            final_thresholds, maj_thresholds,
                                            self.callgraph_aggregator)
        return trace_link_processor.run()
 def process_trace_link_2D_dict(self, trace_link_2D_dict: Dict[float, Dict[float, List[TraceLink]]]):
     print_str_dict, best_eval_result, best_final_threshold, best_maj_thresh = self._process_trace_link_2D_dict(trace_link_2D_dict)
     
     header_row = [""]  # First header cell is empty -> needed for header column
     header_row += [self.FILE_LEVEL_DROP_THRESH_PATTERN.format(final_threshold) for final_threshold in print_str_dict[best_maj_thresh].keys()]
     
     excel_array = [header_row]
     for maj_thresh in sorted(print_str_dict):
         next_row = [self.MAJ_DROP_THRESH_PATTERN.format(maj_thresh)]  # First cell is the maj thresh, followed by the evaluated f1 metrics for this maj thresh
         
         for final_threshold in sorted(print_str_dict[maj_thresh]):
             next_row.append(print_str_dict[maj_thresh][final_threshold])
             
             if self._also_print_eval:
                 log.info(f"\nm{maj_thresh} f{final_threshold}\n"
                          f"{next_row[-1]}")
                 
         excel_array.append(next_row)
         
     excel_array.append([""])  # Add empty row as divider
     if isinstance(best_eval_result, F1ResultObject):
         excel_array = self._add_best_f1_2D_excel_rows(excel_array, print_str_dict, best_eval_result, best_final_threshold, best_maj_thresh)
     else:
         excel_array.append([self.NO_BEST_F1_MESSAGE])
         
     FileUtil.write_eval_to_excel(excel_array, self._excel_output_file_path)
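The method above flattens a nested {maj_thresh: {final_threshold: cell}} dictionary into a header row plus one row per majority threshold before handing it to FileUtil.write_eval_to_excel. A stripped-down sketch of that flattening (the label formats are illustrative, not the class's FILE_LEVEL_DROP_THRESH_PATTERN / MAJ_DROP_THRESH_PATTERN):

    def nested_dict_to_table(result_dict):
        # result_dict: {maj_thresh: {final_threshold: printable cell}}
        final_thresholds = sorted(next(iter(result_dict.values())).keys())
        table = [[""] + ["final {}".format(t) for t in final_thresholds]]  # empty corner cell + column headers
        for maj in sorted(result_dict):
            table.append(["maj {}".format(maj)] + [result_dict[maj][t] for t in final_thresholds])
        return table

    print(nested_dict_to_table({0.4: {0.1: "f1=0.30", 0.2: "f1=0.32"},
                                0.5: {0.1: "f1=0.35", 0.2: "f1=0.31"}}))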
 def __init__(self, lemmatizer_type=LemmatizerType.english_nltk):
     self._lemmatizer_type = lemmatizer_type
     self._lemmatizer = None
     if lemmatizer_type == self.LemmatizerType.english_nltk:
         self._lemmatizer = WordNetLemmatizer()
     elif lemmatizer_type == self.LemmatizerType.english_spacy:
         # Use precalculated files for spacy since free google colab can't handle fasttext model and spacy lemmatizer at once
         if not FileUtil.file_exists(PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV):
             log.error(
                 f"{PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV} does not exists. The spacy lemmatizer needs an precalculated lemma file."
             )
         self._lemmatizer = PandasUtil.read_csv_to_dataframe(
             PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV)
     elif lemmatizer_type == self.LemmatizerType.italian_nltk:
         self._lemmatizer = SnowballStemmer("italian")
     elif lemmatizer_type == self.LemmatizerType.italian_spacy:
         # Use precalculated files for spacy since free google colab can't handle fasttext model and spacy lemmatizer at once
         if not FileUtil.file_exists(PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV):
             log.error(
                 f"{PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV} does not exists. The spacy lemmatizer needs an precalculated lemma file."
             )
         self._lemmatizer = PandasUtil.read_csv_to_dataframe(
             PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV)
     else:
         log.error(f"Unknown case for LemmatizerType: {lemmatizer_type}")
Example #6
 def tokenize_all_sentences_in_directory(self, directory) -> [str]:
     sentences = []
     for file in FileUtil.get_files_in_directory(directory):
         if self._italian:
             sentences += sent_tokenize(FileUtil.read_textfile_into_string(file, self._dataset.encoding()), language="italian")
         else: 
             sentences += sent_tokenize(FileUtil.read_textfile_into_string(file, self._dataset.encoding()))
     return sentences
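sent_tokenize comes from NLTK and accepts an optional language argument, which is exactly how the Italian branch above differs from the English default. A quick stand-alone check, assuming the punkt tokenizer models are downloaded (nltk.download('punkt')):

    from nltk.tokenize import sent_tokenize

    print(sent_tokenize("First sentence. Second one."))                      # English model by default
    print(sent_tokenize("Prima frase. Seconda frase!", language="italian"))  # Italian model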
    def __init__(self, ital=False):

        if ital:
            stopwords_as_string = FileUtil.read_textfile_into_string(
                ITAL_CODE_STOPWORD_FILEPATH)
        else:
            stopwords_as_string = FileUtil.read_textfile_into_string(
                CODE_STOPWORD_FILEPATH)
        self._stop_words = stopwords_as_string.split("\n")
Example #8
 def serialize_txt_batch(dell_asset_list, output_dir):
     # Serialize each dell asset as one text file under the output directory
     # Return all the output file names under the dir in a list for reference
     output_path_list = list([])
     for da in dell_asset_list:
         file_name = "%s.txt" % da.svc_tag
         output_path_list.append(os.path.join(output_dir, file_name))
         FileUtil.save_object_to_path(da.serialize_txt(), output_path_list[-1])
     return output_path_list
Example #9
 def _tokenize_and_preprocess(self, file_path):
     log.debug(f"Tokenizing {file_path}")
     file_representation = self._tokenizer.tokenize(file_path)
     log.debug(f"preprocessing {file_path}")
     file_representation.preprocess(self._preprocessor)
     if self._preprocessed_token_output_directory:
         FileUtil.write_file(self._preprocessed_token_output_directory / (PREPROCESSED_TOKEN_FILENAME_PREFIX 
             +FileUtil.get_filename_from_path(file_path)), file_representation.get_printable_string())
     return file_representation
Example #10
 def write_to(self, file_path):
     json_to_write = {
         self.REQ_FILE_TO_REQ_ELEMENT_ID_MAP:
         self._req_file_to_req_element_id_map,
         self.CODE_FILE_TO_METHOD_MAP:
         self._code_file_to_method_map,
         self.CODE_FILE_TO_NON_CG_ELEMENT_MAP:
         self._code_file_to_non_cg_element_map
     }
     FileUtil.write_to_json(file_path, json_to_write)
Example #11
 def serialize_txt_batch(dell_asset_list, output_dir):
     # Serialize each dell asset as one text file under the output directory
     # Return all the output file names under the dir in a list for reference
     output_path_list = list([])
     for da in dell_asset_list:
         file_name = "%s.txt" % da.svc_tag
         output_path_list.append(os.path.join(output_dir, file_name))
         FileUtil.save_object_to_path(da.serialize_txt(),
                                      output_path_list[-1])
     return output_path_list
Example #12
 def create_all_embeddings(self, input_directory, output_emb_filepath=None) -> [EmbeddingContainer]:
     """
     Creates embeddings for all files in the input directory.
     Writes all embeddings in a file at output_emb_filepath if not None.
     Returns the embeddings as a list.
     """
     log.info("Read directory: " + str(input_directory))
     embedding_list = self.embedd_all_files_in_directory(input_directory)
     if output_emb_filepath is not None:
         FileUtil.write_file(output_emb_filepath, "\n".join(map(str, embedding_list)))
     return embedding_list
Example #13
def main(svc_input, configs):
    logger = Logger("查询日志", verbose=True)
    log_file_name = "log%s_%s.txt" % (svc_input.replace("?", "#"), DateTimeUtil.get_current_datetime(is_date=True))
    log_file_path = WindowsUtil.convert_win_path(os.path.join(temp_dir, log_file_name))
    logger.info("[开始查询] %s" % svc_input)
    try:
        # 找到本地匹配的保修历史记录
        history_zip = ZipFileSVC(zip_file_path=history_zipfile, mode='a')
        start_time = DateTimeUtil.get_current_datetime()
        # 创建出所有可能查询码
        svc_generator = SVCGenerator(svc_input, logger)
        logger.info("创建出所有可能查询码:%s" % len(svc_generator.target_svc_set))
        # 根据本地匹配的非法查询码历史,筛选出目标查询码,以及非法查询码
        existed_svc = history_zip.find_file_regex(svc_generator.regex)
        svc_generator.generate_target_svc_batch(existed_svc, invalid_history_file_path)
        # 调用戴尔查询API,并将API数据转化为实体类数据
        output_dell_asset_list = list([])
        if svc_generator.target_svc_set:
            batch = Batch(logger, configs)
            api_dell_asset_list = batch.begin(svc_generator.target_svc_set)
            output_dell_asset_list = api_dell_asset_list
            logger.info("从API中总共得到%s个结果" % (len(api_dell_asset_list)))
            logger.info("将实体类序列化到本地临时TXT文件")
            temp_text_files_path = DellAsset.serialize_txt_batch(api_dell_asset_list, temp_dir)
            logger.info("将序列化临时文件存到本地zip历史记录,总数:%s" % len(temp_text_files_path))
            history_zip.add_new_file_batch(temp_text_files_path)
            logger.info("删除临时 %s 个TXT文件" % len(temp_text_files_path))
            for file_path in temp_text_files_path:
                FileUtil.delete_file(file_path)
            logger.info("将API得到的实体类和历史记录实体类合并")
        else:
            logger.warn("目标查询码为空,仅从从历史记录中导出结果")
        for svc in svc_generator.existed_svc_set:
            dell_asset_content = history_zip.get_member_content(file_name="%s.txt" % svc)
            output_dell_asset_list.append(DellAsset.deserialize_txt(dell_asset_content))
        logger.info("添加历史记录,总共得到%s个结果" % (len(output_dell_asset_list)))
        excel_output_path = WindowsUtil.convert_win_path(os.path.join(excel_dir, "%s.xlsx" % svc_generator.get_file_name()))
        DellAsset.save_as_excel_batch(output_dell_asset_list, excel_output_path)
        if FileUtil.is_path_existed(excel_output_path):
            logger.info("存为Excel文档成功")
            end_time = DateTimeUtil.get_current_datetime()
            logger.info("总用时 %s " % DateTimeUtil.datetime_diff(start_time, end_time))
            logger.info("[查询结束] 总共%s个结果 保存在:%s" % (len(output_dell_asset_list), excel_output_path))
        else:
            logger.error("[保存结果失败] %s" % excel_output_path)
    except Exception as e:
        # 若程序出现错误失败,发送邮件
        logger.error("[查询失败] 已发送报告 请等待解决")
        logger.error("%s\n%s" % (e, traceback.format_exc()))
        logger.save(log_file_path)
        email_api_key = configs["email_api_key"]
        email = Email(email_api_key, subject="[查询失败] %s %s" % (DateTimeUtil.get_current_datetime(is_date=True), svc_input))
        email.add_attachment(log_file_path)
        email.send(cc_mode=logger.has_error)
Example #14
 def tokenize(self, file_path):
     text_as_string = FileUtil.read_textfile_into_string(file_path, self._dataset.encoding())
     if self._italian:
         tokenized_text = " ".join(text_as_string.split("'"))
         return TextFileRepresentation(word_tokenize(tokenized_text), file_path)
     else:
         return TextFileRepresentation(word_tokenize(text_as_string), file_path)
Example #15
 def add_new_file_batch(self, file_path_list):
     # duplicated files are allowed, so be careful
     for file_path in file_path_list:
         if not FileUtil.is_path_existed(file_path):
             continue
         file_name = os.path.split(file_path)[-1]
         if len(file_name) >= 7:
             self.file.write(filename=file_path, arcname=file_name)
Example #16
def convert_comet_to_recall_prec_csv(file_path, dataset, drop_threshs,
                                     output_file_name):
    """
    Creates a csv file with recall/precision pairs that are generated by applying the thresholds.
    e.g. drop_threshs = [0, 0.01, 0.02, ..., 1]
    The csv file can be used to illustrate a recall/precision graph in LaTeX.
    """
    trace_links = _extract_comet_trace_links(file_path)
    eval_result_list = _eval_comet_data_multiple_thresh(
        trace_links, dataset, drop_threshs)
    recall_prec_dict = {}
    for eval_result_object, _ in eval_result_list:
        if isinstance(eval_result_object, F1ResultObject):
            recall_prec_dict[
                eval_result_object.recall] = eval_result_object.precision

    FileUtil.write_recall_precision_csv(recall_prec_dict, output_file_name)
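Below is a sketch of the threshold list suggested in the docstring, plus a plain-csv stand-in for what FileUtil.write_recall_precision_csv presumably produces; the helper name and output format here are assumptions, not the project's actual implementation.

    import csv

    # Thresholds 0, 0.01, ..., 1 as suggested in the docstring
    drop_threshs = [round(i / 100, 2) for i in range(101)]

    def write_recall_precision_csv_sketch(recall_prec_dict, output_file_name):
        # One recall,precision pair per row, sorted by recall
        with open(output_file_name, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["recall", "precision"])
            for recall in sorted(recall_prec_dict):
                writer.writerow([recall, recall_prec_dict[recall]])

    write_recall_precision_csv_sketch({0.5: 0.8, 0.7: 0.6}, "recall_precision.csv")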
Example #17
 def add_new_file_batch(self, file_path_list):
     # duplicated files are allowed, so be careful
     for file_path in file_path_list:
         if not FileUtil.is_path_existed(file_path):
             continue
         file_name = os.path.split(file_path)[-1]
         if len(file_name) >= 7:
             self.file.write(filename=file_path, arcname=file_name)
Example #18
 def tokenize(self, file_path) -> FileRepresentation:
     text_as_string = FileUtil.read_textfile_into_string(file_path, self._dataset.encoding())
     word_tokenized_sentences = None
     if self._italian:
         tokens = [word_tokenize(" ".join(sent.split("'")), language="italian") for sent in sent_tokenize(text_as_string, language="italian")]
         return TextFileGroupedRepresentation(tokens, file_path) 
     else:
         word_tokenized_sentences = [word_tokenize(sent) for sent in sent_tokenize(text_as_string)]
     return TextFileGroupedRepresentation(word_tokenized_sentences, file_path)
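The Italian branch above replaces apostrophes with spaces before word-level tokenization, so contractions such as l'utente split into separate tokens. A quick demonstration with NLTK's word_tokenize (punkt models required):

    from nltk.tokenize import word_tokenize

    sentence = "L'utente conferma l'ordine."
    print(word_tokenize(" ".join(sentence.split("'")), language="italian"))
    # The apostrophes become spaces first, so "L'utente" yields the two tokens 'L' and 'utente'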
Example #19
 def tokenize(self, file_path) -> FileRepresentation:
     text_as_string = FileUtil.read_textfile_into_string(file_path, self._dataset.encoding())
     grp = re.search(self.JAVADOC_TAGS, text_as_string, re.RegexFlag.IGNORECASE)
     if grp:
         matched_tag = grp[0]
         substring_index = text_as_string.find(matched_tag)
         text_as_string = text_as_string[:substring_index]
         text_as_string = super(JavaDocDescriptionOnlyTokenizer, self).tokenize_to_string_list(text_as_string)
     return TextFileRepresentation(text_as_string, file_path)
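JAVADOC_TAGS is a class constant not shown in this snippet; the sketch below assumes a simple alternation of common tags purely to illustrate how the description text is cut off at the first Javadoc tag:

    import re

    JAVADOC_TAGS = r"@param|@return|@throws|@see"  # assumed pattern, not the project's constant

    text = "Parses the config file and returns its entries. @param path the file to read @return a dict"
    match = re.search(JAVADOC_TAGS, text, re.IGNORECASE)
    if match:
        text = text[:text.find(match[0])]
    print(text)  # everything before the first tag is kept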
    def process_trace_link_dict(self, trace_link_dict: Dict[float, List[TraceLink]]):
        print_str_dict, best_eval_result, best_thresh = self._process_trace_link_dict(trace_link_dict)
        header_row = []  # Contains thresholds
        value_row = []  # Contains evaluated f1 metrics
        for final_threshold in sorted(print_str_dict.keys()):
            header_row.append(self.FILE_LEVEL_DROP_THRESH_PATTERN.format(final_threshold))
            value_row.append(print_str_dict[final_threshold])
            
            if self._also_print_eval:
                log.info(f"\nf{final_threshold}\n"
                         f"{value_row[-1]}")
        
        excel_array = [header_row] + [value_row]
        excel_array.append([""])  # Add empty row as divider
        if isinstance(best_eval_result, F1ResultObject):
            excel_array = self._add_best_f1_excel_rows(excel_array, print_str_dict, best_eval_result, best_thresh)
        else:
            excel_array.append([self.NO_BEST_F1_MESSAGE])

        FileUtil.write_eval_to_excel(excel_array, self._excel_output_file_path)
Example #21
def _extract_comet_trace_links(file_path):
    lines = FileUtil.read_textfile_into_lines_list(file_path)
    lines = lines[6:]  # first 6 lines contain no similarity data
    trace_links = []
    for line in lines:
        req, code, sim = line.split(" ")
        code = _remove_package_prefix(code)
        if code.endswith(".jsp") or code.endswith(".txt"):
            continue
        sim = float(sim)
        trace_links.append(TraceLink(req, code, sim))
    return trace_links
Example #22
 def filter_invalid_history(self, invalid_history_file_path):
     # If invalid history provided, remove those invalid from target svc set
     if FileUtil.is_path_existed(invalid_history_file_path):
         with open(invalid_history_file_path, mode='r') as invalid_history_file:
             # read the history file line by line, in case file too large
             for svc in invalid_history_file:
                 svc = svc.replace("\n", "")
                 if len(svc) == 7 and svc in self.target_svc_set:
                     self.invalid_history_count += 1
                     self.target_svc_set.remove(svc)
                 if not self.target_svc_set:
                     break
     self.logger.info("已知的本地非法查询码历史:%s" % self.invalid_history_count)
Example #23
 def filter_invalid_history(self, invalid_history_file_path):
     # If invalid history provided, remove those invalid from target svc set
     if FileUtil.is_path_existed(invalid_history_file_path):
         with open(invalid_history_file_path,
                   mode='r') as invalid_history_file:
             # read the history file line by line, in case file too large
             for svc in invalid_history_file:
                 svc = svc.replace("\n", "")
                 if len(svc) == 7 and svc in self.target_svc_set:
                     self.invalid_history_count += 1
                     self.target_svc_set.remove(svc)
                 if not self.target_svc_set:
                     break
     self.logger.info("已知的本地非法查询码历史:%s" % self.invalid_history_count)
 def iterate_files(tokenizer, preprocessor, folder):
     for file in FileUtil.get_files_in_directory(folder, True):
         file_representation = tokenizer.tokenize(file)
         file_representation.preprocess(preprocessor)
         for word in file_representation.token_list:
             lemma = [token.lemma_ for token in spacy_lemmatizer(word)]
             if len(lemma) > 1:
                 log.info(
                     f"More than one lemma {lemma} for \"{word}\". Using \"{''.join(lemma)}\" as lemma"
                 )
             lemma = "".join(lemma)
             if word in word_to_lemma_map:
                 if word_to_lemma_map[word] != lemma:
                     log.info(
                         f"Different duplicate lemma for {word}: {word_to_lemma_map[word]} <-> {lemma}"
                     )
             else:
                 word_to_lemma_map[word] = lemma
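iterate_files relies on a spacy pipeline bound to the name spacy_lemmatizer and a module-level word_to_lemma_map. A minimal stand-alone version of the lemma lookup, assuming some installed spacy model (en_core_web_sm is just an example):

    import spacy

    nlp = spacy.load("en_core_web_sm")  # assumed model name; any installed pipeline works

    def lemma_of(word):
        # Join the lemmas in case the "word" splits into several tokens, as iterate_files does
        return "".join(token.lemma_ for token in nlp(word))

    print(lemma_of("running"), lemma_of("better"))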
Example #25
class Warranty(object):
    translation = FileUtil.read_file(file_path=translation_url,
                                     isYML=True,
                                     isURL=True)

    def __init__(self,
                 service_en,
                 start_date,
                 end_date,
                 provider,
                 service_ch=None):
        self.start_date = DateTimeUtil.parse_str_date(start_date)
        self.end_date = DateTimeUtil.parse_str_date(end_date)
        self.service_en = str(service_en)
        self.service_en = self.service_en.replace(",", " ")
        self.provider = provider
        self.service_ch = service_ch
        if not service_ch or service_ch == "?":
            self.service_ch = Warranty.translation.get(service_en, "?")

    def to_excel_data(self):
        return [
            self.service_ch, self.service_en, self.start_date, self.end_date,
            self.provider
        ]

    def __repr__(self):
        return "%s,%s,%s,%s,%s" % (self.service_ch, self.service_en,
                                   self.start_date, self.end_date,
                                   self.provider)

    @staticmethod
    def deserialize_txt(warranty_line):
        if warranty_line:
            items = warranty_line.split(",")
            if len(items) >= 5:
                if items[1] or items[0]:
                    return Warranty(service_ch=items[0],
                                    service_en=items[1],
                                    start_date=items[2],
                                    end_date=items[3],
                                    provider=items[4])
        return None
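A small usage sketch for the comma-separated line format handled by deserialize_txt; the field values are invented, and the class's translation file must be reachable for the class body above to load:

    line = "ProSupport,ProSupport Plus,2020-01-01,2023-01-01,Dell"
    warranty = Warranty.deserialize_txt(line)
    if warranty is not None:
        print(warranty.to_excel_data())  # [service_ch, service_en, start_date, end_date, provider]
        print(repr(warranty))            # the comma-separated form used for serialization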
Example #26
 def embedd_all_files_in_directory(self, directory):
     all_filenames = FileUtil.get_files_in_directory(directory)
     all_embeddings = []
     for filename in all_filenames:
         try:
             file_representation = self._tokenize_and_preprocess(filename)
         except (FileNotFoundError, IsADirectoryError, PermissionError, UnicodeDecodeError,) as e:
             log.info(f"SKIPPED: Error on reading or tokenizing {filename}: {e}")
             continue
         except JavaSyntaxError as j:
             log.info(f"SKIPPED: JavaSyntaxError on tokenizing {filename} (Note: code files needs to be compilable): {j.at}")
             continue
         except (JavaParserError, LexerError) as j:
             log.info(f"SKIPPED: Error on tokenizing {filename} (Note: code files needs to be compilable): {j}")
             continue
         file_embedding = self._create_embeddings(file_representation)
         if file_embedding:
             all_embeddings.append(file_embedding)
         else:
             log.info(f"No embedding for {filename}")
     return all_embeddings
Example #27
 def load_from(cls, file_path):
     loaded_json = FileUtil.read_from_json(file_path)
     return cls(loaded_json[cls.REQ_FILE_TO_REQ_ELEMENT_ID_MAP],
                loaded_json[cls.CODE_FILE_TO_METHOD_MAP],
                loaded_json[cls.CODE_FILE_TO_NON_CG_ELEMENT_MAP])
Example #28
def main(svc_input, configs):
    logger = Logger("查询日志", verbose=True)
    log_file_name = "log%s_%s.txt" % (svc_input.replace(
        "?", "#"), DateTimeUtil.get_current_datetime(is_date=True))
    log_file_path = WindowsUtil.convert_win_path(
        os.path.join(temp_dir, log_file_name))
    logger.info("[开始查询] %s" % svc_input)
    try:
        # 找到本地匹配的保修历史记录
        history_zip = ZipFileSVC(zip_file_path=history_zipfile, mode='a')
        start_time = DateTimeUtil.get_current_datetime()
        # 创建出所有可能查询码
        svc_generator = SVCGenerator(svc_input, logger)
        logger.info("创建出所有可能查询码:%s" % len(svc_generator.target_svc_set))
        # 根据本地匹配的非法查询码历史,筛选出目标查询码,以及非法查询码
        existed_svc = history_zip.find_file_regex(svc_generator.regex)
        svc_generator.generate_target_svc_batch(existed_svc,
                                                invalid_history_file_path)
        # 调用戴尔查询API,并将API数据转化为实体类数据
        output_dell_asset_list = list([])
        if svc_generator.target_svc_set:
            batch = Batch(logger, configs)
            api_dell_asset_list = batch.begin(svc_generator.target_svc_set)
            output_dell_asset_list = api_dell_asset_list
            logger.info("从API中总共得到%s个结果" % (len(api_dell_asset_list)))
            logger.info("将实体类序列化到本地临时TXT文件")
            temp_text_files_path = DellAsset.serialize_txt_batch(
                api_dell_asset_list, temp_dir)
            logger.info("将序列化临时文件存到本地zip历史记录,总数:%s" %
                        len(temp_text_files_path))
            history_zip.add_new_file_batch(temp_text_files_path)
            logger.info("删除临时 %s 个TXT文件" % len(temp_text_files_path))
            for file_path in temp_text_files_path:
                FileUtil.delete_file(file_path)
            logger.info("将API得到的实体类和历史记录实体类合并")
        else:
            logger.warn("目标查询码为空,仅从从历史记录中导出结果")
        for svc in svc_generator.existed_svc_set:
            dell_asset_content = history_zip.get_member_content(
                file_name="%s.txt" % svc)
            output_dell_asset_list.append(
                DellAsset.deserialize_txt(dell_asset_content))
        logger.info("添加历史记录,总共得到%s个结果" % (len(output_dell_asset_list)))
        excel_output_path = WindowsUtil.convert_win_path(
            os.path.join(excel_dir, "%s.xlsx" % svc_generator.get_file_name()))
        DellAsset.save_as_excel_batch(output_dell_asset_list,
                                      excel_output_path)
        if FileUtil.is_path_existed(excel_output_path):
            logger.info("存为Excel文档成功")
            end_time = DateTimeUtil.get_current_datetime()
            logger.info("总用时 %s " %
                        DateTimeUtil.datetime_diff(start_time, end_time))
            logger.info("[查询结束] 总共%s个结果 保存在:%s" %
                        (len(output_dell_asset_list), excel_output_path))
        else:
            logger.error("[保存结果失败] %s" % excel_output_path)
    except Exception as e:
        # 若程序出现错误失败,发送邮件
        logger.error("[查询失败] 已发送报告 请等待解决")
        logger.error("%s\n%s" % (e, traceback.format_exc()))
        logger.save(log_file_path)
        email_api_key = configs["email_api_key"]
        email = Email(
            email_api_key,
            subject="[查询失败] %s %s" %
            (DateTimeUtil.get_current_datetime(is_date=True), svc_input))
        email.add_attachment(log_file_path)
        email.send(cc_mode=logger.has_error)
def create_callgraph_from_raw_file(dataset: Dataset,
                                   create_class_callgraph=False):
    """
    Extract class and method call graph from a raw call graph file generated by the java call graph tool
    The inout raw call graph file is automatically retrieved from dataset.raw_call_graph_path()
    Saves the call graphs as json files at dataset.method_callgraph_path() and dataset.class_callgraph_path()
    
    resulting class call graph:
    dict["classname"] = dict{
                            called_by=[str]
                            calls=[str]
                        }
    
    
    
    resulting method call graph:
    dict["classname.methodname(paramtyp1,paramtyp2)"] = dict{
                                                            called_by=[classname.methodname(paramtyp1,paramtyp2),...]
                                                            calls=[classname.methodname(paramtyp1,paramtyp2),...]
                                                            class_name=str
                                                            method_name=str
                                                            params=[str]
                                                            }
    }
        
    """
    raw_txt_path = dataset.raw_call_graph_path()
    output_class_callgraph = dataset.class_callgraph_path()
    output_method_callgraph = dataset.method_callgraph_path()
    text_rows = []
    try:
        with open(raw_txt_path, 'r', encoding='utf8') as file:
            text_rows = file.readlines()
    except IOError:
        log.error("Unable to read " + str(raw_txt_path))

    class_call_graph = dict()
    method_call_graph = dict()

    def insert_class(class_name, calls=set(), called_by=set()):

        if class_name in class_call_graph:
            class_call_graph[class_name][CALLS] |= calls
            class_call_graph[class_name][CALLED_BY] |= called_by
        else:
            class_ref = dict()
            class_ref[CALLED_BY] = called_by
            class_ref[CALLS] = calls
            class_call_graph[class_name] = class_ref

    def insert_entry(dict_key,
                     class_name,
                     method_name,
                     param_list,
                     called_by=set(),
                     calls=set()):
        if dict_key in method_call_graph:
            method_call_graph[dict_key][CALLS] |= calls
            method_call_graph[dict_key][CALLED_BY] |= called_by
        else:
            method_dict = dict()
            method_dict[CALLS] = calls
            method_dict[CALLED_BY] = called_by
            method_dict[CLASS_NAME] = class_name
            method_dict[METHOD_NAME] = method_name
            method_dict[PARAMS] = param_list

            method_call_graph[dict_key] = method_dict

    def remove_external_calls():
        for dict_key in method_call_graph:
            method_call_graph[dict_key][CALLS] = [
                callee for callee in method_call_graph[dict_key][CALLS]
                if callee in method_call_graph
            ]
            method_call_graph[dict_key][CALLED_BY] = [
                caller for caller in method_call_graph[dict_key][CALLED_BY]
                if caller in method_call_graph
            ]

    for row in text_rows:
        row_split = row.split(":")
        if row_split[0] == "C":  # Class level call
            classes = row_split[1].split(" ")
            class_1 = _clean(classes[0])
            class_2 = _clean(classes[1])
            if _is_external_class(dataset, class_1) or _is_external_class(
                    dataset, class_2):
                continue
            caller_class_name = _extract_name(classes[0])
            callee_class_name = _extract_name(classes[1].replace('\r',
                                                                 '').replace(
                                                                     '\n', ''))
            if caller_class_name == callee_class_name:
                continue
            if "$" in caller_class_name or "$" in callee_class_name:
                continue  # Leave out inner classes

            if create_class_callgraph:
                insert_class(caller_class_name, set([callee_class_name]),
                             set())
                insert_class(callee_class_name, set(),
                             set([caller_class_name]))

        elif row_split[0] == "M":  # method level call
            # row_split[1] = Class of caller method
            # row_split[2] = caller method<whitespace>calltype and class of callee method
            # row_split[3] = callee method

            split_2 = row_split[2].split(" ")
            split_3 = split_2[1].split(")")
            if _is_external_class(dataset, row_split[1]) or _is_external_class(
                    dataset, split_3[1]):
                continue
            caller_method = split_2[0]
            callee_method = row_split[3]
            if _is_constructor(caller_method) or _is_constructor(
                    callee_method):
                continue
            if _is_access(caller_method) or _is_access(callee_method):
                continue
            caller_class = _extract_name(row_split[1])
            callee_class = _extract_name(split_3[1])
            if "$" in caller_class or "$" in callee_class:
                continue  # Leave out references to inner classes
            # call_type = split_3[0][1]
            split_4 = caller_method.split("(")
            caller_name = split_4[0]
            caller_param = []
            if not split_4[1].startswith(")"):  # params existing
                caller_param = _split_param(
                    split_4[1][:-1])  # Leave out last character, which is a )

            split_5 = callee_method.split("(")
            callee_name = split_5[0]
            callee_param = []
            if not split_5[1].startswith(")"):  # params existing
                callee_param = _split_param(split_5[1].replace(
                    '\r', '').replace(
                        '\n', '')[:-1])  # Leave out last character, which is )

            caller_dict_key = build_class_method_param_dict_key(
                caller_class, caller_name, caller_param)
            callee_dict_key = build_class_method_param_dict_key(
                callee_class, callee_name, callee_param)
            # called_by = caller_dict_key
            # calls = callee_dict_key

            insert_entry(caller_dict_key, caller_class, caller_name,
                         caller_param, set(), set([callee_dict_key]))
            insert_entry(callee_dict_key, callee_class, callee_name,
                         callee_param, set([caller_dict_key]), set())

        else:
            log.error("Unknow start character: " + row_split[0])

    remove_external_calls()

    # convert all sets to lists since set is not json serializable
    if create_class_callgraph:
        for entry in class_call_graph:
            class_call_graph[entry][CALLS] = list(
                class_call_graph[entry][CALLS])
            class_call_graph[entry][CALLED_BY] = list(
                class_call_graph[entry][CALLED_BY])
        FileUtil.write_to_json(output_class_callgraph, class_call_graph)

    for entry in method_call_graph:
        method_call_graph[entry][CALLS] = list(method_call_graph[entry][CALLS])
        method_call_graph[entry][CALLED_BY] = list(
            method_call_graph[entry][CALLED_BY])
    FileUtil.write_to_json(output_method_callgraph, method_call_graph)
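The final loops convert every set to a list because the standard json module refuses to serialize sets. A short demonstration of the same conversion:

    import json

    entry = {"calls": {"A.run()"}, "called_by": set()}
    # json.dumps(entry) would raise TypeError: Object of type set is not JSON serializable
    entry = {key: list(value) for key, value in entry.items()}
    print(json.dumps(entry))  # {"calls": ["A.run()"], "called_by": []}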
Example #30
            email_api_key,
            subject="[Query failed] %s %s" %
            (DateTimeUtil.get_current_datetime(is_date=True), svc_input))
        email.add_attachment(log_file_path)
        email.send(cc_mode=logger.has_error)


if __name__ == '__main__':
    while True:
        print "请输入7位查询码,未知位用?代替,比如ABCEF??(符号为英文符号)"
        required_file_path = [
            history_zipfile, config_yml_path, invalid_history_file_path
        ]
        start = True
        for f in required_file_path:
            if not FileUtil.is_path_existed(f):
                print "请把程序运行文件放到程序运行文件夹下"
                start = False
                break
        if start:
            line = sys.stdin.readline()
            svc_input = line.split()[0]
            configs = FileUtil.read_file(config_yml_path, isYML=True)
            if len(svc_input) != 7:
                print "A 7-character service code is required"
            elif configs is None:
                print "Please put a valid config file into the program's working directory"
            else:
                wild_card_count = 0
                for w in svc_input:
                    if w not in letters:
 def __init__(self, file_path, file_vector=None):
     self.file_path = file_path
     self.file_vector = file_vector
     self.file_name = FileUtil.get_filename_from_path(self.file_path)
Example #32
        logger.error("[查询失败] 已发送报告 请等待解决")
        logger.error("%s\n%s" % (e, traceback.format_exc()))
        logger.save(log_file_path)
        email_api_key = configs["email_api_key"]
        email = Email(email_api_key, subject="[查询失败] %s %s" % (DateTimeUtil.get_current_datetime(is_date=True), svc_input))
        email.add_attachment(log_file_path)
        email.send(cc_mode=logger.has_error)


if __name__ == '__main__':
    while True:
        print "请输入7位查询码,未知位用?代替,比如ABCEF??(符号为英文符号)"
        required_file_path = [history_zipfile, config_yml_path, invalid_history_file_path]
        start = True
        for f in required_file_path:
            if not FileUtil.is_path_existed(f):
                print "请把程序运行文件放到程序运行文件夹下"
                start = False
                break
        if start:
            line = sys.stdin.readline()
            svc_input = line.split()[0]
            configs = FileUtil.read_file(config_yml_path, isYML=True)
            if len(svc_input) != 7:
                print "A 7-character service code is required"
            elif configs is None:
                print "Please put a valid config file into the program's working directory"
            else:
                wild_card_count = 0
                for w in svc_input:
                    if w not in letters:
Example #33
 def __init__(self, file_path):
     self.file_path = file_path
     self.file_name = FileUtil.get_filename_from_path(file_path)
Example #34
    def tokenize(self, file_path):
        text_lines = FileUtil.read_textfile_into_lines_list(file_path, self._dataset.encoding())
        
        uc_name_words = []
        uc_actor_words = []
        uc_precond_words = []
        uc_postcond_words = []
        uc_description_words = []
        uc_quality_req_words = []
        uc_flow_of_events_words = []
        last_word_category = uc_description_words  # Default

        for line in text_lines:
            line = line.lstrip()  # Remove leading white spaces/tabs
            if self._dataset.UC_NAME_TEMPLATE_REGEX.match(line):
                matched_string = self._dataset.UC_NAME_TEMPLATE_REGEX.match(line).group(0)
                uc_name_words += self.tokenize_to_string_list(line[len(matched_string):])
                last_word_category = uc_name_words
            elif self._dataset.UC_DESCRIPTION_TEMPLATE_REGEX.match(line):
                matched_string = self._dataset.UC_DESCRIPTION_TEMPLATE_REGEX.match(line).group(0)
                uc_description_words += self.tokenize_to_string_list(line[len(matched_string):])
                last_word_category = uc_description_words
            elif self._dataset.UC_ACTOR_TEMPLATE_REGEX.match(line):
                matched_string = self._dataset.UC_ACTOR_TEMPLATE_REGEX.match(line).group(0)
                uc_actor_words += self.tokenize_to_string_list(line[len(matched_string):])
                last_word_category = uc_actor_words
            elif self._dataset.UC_PRECONDITION_TEMPLATE_REGEX.match(line):
                matched_string = self._dataset.UC_PRECONDITION_TEMPLATE_REGEX.match(line).group(0)
                uc_precond_words += self.tokenize_to_string_list(line[len(matched_string):])
                last_word_category = uc_precond_words
            elif self._dataset.UC_POSTCONDITION_TEMPLATE_REGEX.match(line):
                matched_string = self._dataset.UC_POSTCONDITION_TEMPLATE_REGEX.match(line).group(0)
                uc_postcond_words += self.tokenize_to_string_list(line[len(matched_string):])
                last_word_category = uc_postcond_words
            elif self._dataset.UC_FLOW_OF_EVENTS_TEMPLATE_REGEX.match(line):
                matched_string = self._dataset.UC_FLOW_OF_EVENTS_TEMPLATE_REGEX.match(line).group(0)
                uc_flow_of_events_words += self.tokenize_to_string_list(line[len(matched_string):])
                last_word_category = uc_flow_of_events_words
            elif self._dataset.UC_QUALI_REQ_TEMPLATE_REGEX.match(line):
                matched_string = self._dataset.UC_QUALI_REQ_TEMPLATE_REGEX.match(line).group(0)
                uc_quality_req_words += self.tokenize_to_string_list(line[len(matched_string):])
                last_word_category = uc_quality_req_words
            elif self._dataset.UC_USER_TEMPLATE_REGEX.match(line):
                # part of flow of events
                matched_string = self._dataset.UC_USER_TEMPLATE_REGEX.match(line).group(0)
                uc_flow_of_events_words += self.tokenize_to_string_list(line[len(matched_string):])
                last_word_category = uc_flow_of_events_words
            elif self._dataset.UC_SYSTEM_TEMPLATE_REGEX.match(line):
                # part of flow of events
                matched_string = self._dataset.UC_SYSTEM_TEMPLATE_REGEX.match(line).group(0)
                uc_flow_of_events_words += self.tokenize_to_string_list(line[len(matched_string):])
                last_word_category = uc_flow_of_events_words    
            else:
                last_word_category += self.tokenize_to_string_list(line)
                
        complete_uc_flow_of_events_words_string = " ".join(uc_flow_of_events_words)
        if self._italian:
            uc_flow_of_events_words = [word_tokenize(" ".join(sent.split("'")), language="italian") for sent in sent_tokenize(complete_uc_flow_of_events_words_string, language="italian")]
        else:
            uc_flow_of_events_words = [word_tokenize(sent) for sent in sent_tokenize(complete_uc_flow_of_events_words_string)]
        
        return UseCaseFileRepresentation(file_path, uc_name_words, uc_description_words, uc_actor_words, uc_precond_words, uc_postcond_words,
                                          uc_flow_of_events_words, uc_quality_req_words)
Example #35
 def tokenize(self, file_path) -> FileRepresentation:
     text_as_string = FileUtil.read_textfile_into_string(file_path, self._dataset.encoding())
     if self._italian:
         return TextFileRepresentation(sent_tokenize(text_as_string, language="italian"), file_path)
     return TextFileRepresentation(sent_tokenize(text_as_string), file_path)
Example #36
 def class_callgraph(self):
     return FileUtil.read_from_json(self.EANCI_CLASS_CALLGRAPH_PATH)