示例#1
0
def main(args):
    current_path = os.getcwd()
    logging.info(f'current python path {current_path}...')

    logging.info('Load data...')
    train_csv = pd.read_csv(f'{current_path}/data/tbrain_train_final_0603.csv')

    logging.info('Data preprocessing...')
    train_csv = pd.concat(
        [train_csv, pd.DataFrame(['dummy' for i in range(5023)])], axis=1)
    train_csv.columns = list(train_csv.columns[:-1]) + ['article']
    train_csv['domain'] = train_csv['hyperlink'].apply(
        lambda x: x.split('//')[1].split('/')[0])

    logging.info('Start crawling...')
    crawler = Crawler()
    crawler.crawling_process(train_csv, test=False)

    logging.info(f'Saving to {args.save_path}...')
    train_csv.to_csv(f'{args.save_path}', index=False)

    logging.info('Finish!')