예제 #1
0
 def __init__(self, path, file_name):
     """Parse an XML file and keep its root element on the instance.

     The file location is built by concatenating, in order, the value of
     ``approot.get_root()``, the ``system``/``data_path`` config entry,
     ``path`` and ``file_name``.

     Args:
         path: Path fragment appended after the configured data path.
         file_name: Name of the XML file, appended after ``path``.
     """
     configured_data_path = config.Config().get_config('system', 'data_path')
     base_location = approot.get_root() + configured_data_path
     # Load the document and remember its root element.
     xml_location = base_location + path + file_name
     parsed_document = xml.dom.minidom.parse(xml_location)
     self.xml_doc = parsed_document.documentElement
예제 #2
0
def load_data():
    """Load the spam-detection CSV files into a single DataFrame.

    The directory is built from ``data_dir`` and ``external_data`` on the
    project config plus ``'spam_detection/'``. The five YouTube comment
    files are read first, then the SMS collection (with the Python parser
    engine), and all frames are concatenated in that order.

    Returns:
        The concatenated ``pandas.DataFrame``; original row indices are kept.
    """
    settings = config.Config()
    base_dir = settings.data_dir + settings.external_data + 'spam_detection/'

    youtube_names = (
        'Youtube01-Psy.csv',
        'Youtube02-KatyPerry.csv',
        'Youtube03-LMFAO.csv',
        'Youtube04-Eminem.csv',
        'Youtube05-Shakira.csv',
    )
    frames = [pd.read_csv(base_dir + name) for name in youtube_names]

    sms_frame = pd.read_csv(base_dir + 'SMSSpamCollection.csv',
                            engine='python')
    frames.append(sms_frame)

    return pd.concat(frames)
예제 #3
0
    def __init__(self):
        """Set up the output locations and driver for the current test run.

        Reads the output paths from the ``system`` config section and stores
        the timestamped log, screenshot and Excel report locations, plus the
        HTML report path, in ``constants``. It then creates the Excel report
        folder (prefixed with ``approot.get_root()``) when it does not exist,
        and stores the driver selected by the ``driver``/``default`` config
        entry in ``constants`` under ``'my_driver'``.
        """
        # Initialise the configuration object and the shared constants store.
        settings = config.Config()
        constants._init()

        # A single timestamp names every output of this run.
        started_at = datetime.datetime.now()
        day_stamp = started_at.strftime('%Y-%m-%d')
        clock_stamp = started_at.strftime('%H%M%S')

        # Log output for this run: dated folder, time-of-day file name.
        log_root = settings.get_config('system', 'log_path')
        constants.set_value('log_folder', log_root + day_stamp)
        constants.set_value('log_file', clock_stamp)

        # Screenshot output folder for this run.
        screenshot_root = settings.get_config('system', 'screenshot_path')
        constants.set_value(
            'screenshot_folder',
            screenshot_root + started_at.strftime('%Y-%m-%d_%H%M%S'))

        # Excel report output for this run.
        excel_root = settings.get_config('system', 'excel_report_path')
        excel_dir = excel_root + day_stamp
        constants.set_value('excel_report_folder', excel_dir)
        constants.set_value('excel_report_file', clock_stamp)

        # Location where HTML files are stored.
        constants.set_value(
            'html_report_path',
            settings.get_config('system', 'html_report_path'))

        # Create the folder the Excel report is exported to, if missing.
        excel_dir_on_disk = approot.get_root() + excel_dir
        if not os.path.exists(excel_dir_on_disk):
            os.makedirs(excel_dir_on_disk)

        # Select the driver named in the config and share it via constants.
        active_driver = driver.Driver().get_driver(
            settings.get_config('driver', 'default'))
        constants.set_value('my_driver', active_driver)
예제 #4
0
    except Exception:
        return 'no_language_detected'


def remove_ponctuation(text):
    """Strip ASCII punctuation from *text* and lower-case the result.

    Args:
        text: Value to clean. Only ``str`` instances are processed.

    Returns:
        ``text`` with every character of ``string.punctuation`` removed and
        the remainder lower-cased, or ``None`` when ``text`` is not a ``str``
        (which includes ``None`` itself).
    """
    # isinstance() already rejects None, so no separate None check is needed.
    if not isinstance(text, str):
        return None
    # A translation table with only a delete-set removes the punctuation
    # characters in one pass.
    return text.translate(str.maketrans('', '', string.punctuation)).lower()


if __name__ == '__main__':
    # Script entry point: load the comments CSV, de-duplicate it and derive
    # cleaned / classified text columns.
    logging.getLogger().setLevel(logging.INFO)

    c = config.Config()

    # The input path is assembled from three config attributes; the file is
    # read with pandas' Python parser engine.
    data = pd.read_csv(c.data_dir + c.raw_data + c.comments_file,
                       engine='python')

    # Drop duplicates: keep only the last row for each 'cid' value
    data.drop_duplicates(subset='cid', inplace=True, keep='last')

    # Text cleaning: overwrite 'text' with the output of remove_ponctuation
    data['text'] = data['text'].apply(remove_ponctuation)
    logging.info('Punctuation removed!')

    # Text classifier: chain length_description and description_classifier
    # over the cleaned text and store the result in 'text_level'
    data['text_level'] = data['text'].apply(length_description).apply(
        description_classifier)