Exemplo n.º 1
0
 def gen_docs(jsons: [{}],
              char_wb: bool = False,
              add_taint: bool = False) -> [Learner.LabelledDocs]:
     """
     Turn each flow into a labelled document built from its URL.

     For every flow the URL is passed through ``Analyzer.filter_url_words``;
     the resulting text, the training label, the numeric feature values and
     the real label are wrapped in a ``Learner.LabelledDocs``.

     :param jsons: The flow jsons. Each flow is indexed by 'url', 'taint',
         'label', 'real_label' and every name in ``Analyzer.numeric_features``.
     :param char_wb: Forwarded unchanged to ``Learner.LabelledDocs`` as the
         ``char_wb`` keyword argument.
     :param add_taint: Whether to append the flow's taint to the text as an
         extra token prefixed with ``t_``.
     :return: One ``Learner.LabelledDocs`` per flow, in input order.
     """
     labelled_docs = []
     multi_taint_flows = 0

     for entry in jsons:
         text = Analyzer.filter_url_words(entry['url'])

         taint = entry['taint']
         # An underscore in the taint value is counted as "more than one
         # taint" (see the summary log line at the end).
         if '_' in taint:
             multi_taint_flows += 1
         if add_taint:
             text = text + ' t_' + taint

         # Labels arrive as strings; only the exact string '1' maps to 1.
         train_label = int(entry['label'] == '1')
         true_label = int(entry['real_label'] == '1')
         if train_label != true_label:
             logger.info(
                 "Flow's real label does not match the training label for %s, real_label = %d label = %d",
                 entry['url'], true_label, train_label)

         numeric_values = [entry[feature] for feature in Analyzer.numeric_features]
         doc = Learner.LabelledDocs(text,
                                    train_label,
                                    numeric_values,
                                    true_label,
                                    char_wb=char_wb)
         labelled_docs.append(doc)

     logger.info('The number of flows who have more than 1 taints: %d',
                 multi_taint_flows)
     return labelled_docs