Пример #1
0
def fetch_labeled_data():
    str_old_time = "2015-08-01 00:00:00"
    str_new_time = "2016-11-31 00:00:00"
    proj_name = "article_cat"

    LabeledCrawler(proj_name=proj_name).rebuild_table()
    LabeledCrawlerIheima(proj_name=proj_name).crawl(str_old_time, str_new_time)
    LabeledCrawlerKanchai(proj_name=proj_name).crawl(str_old_time,
                                                     str_new_time)
    LabeledCrawlerLeiphone(proj_name=proj_name).crawl(str_old_time,
                                                      str_new_time)
    LabeledCrawlerLieyun(proj_name=proj_name).crawl(str_old_time, str_new_time)
    LabeledCrawlerSootoo(proj_name=proj_name).crawl(str_old_time, str_new_time)
    LabeledCrawlerYiou(proj_name=proj_name).crawl(str_old_time, str_new_time)
    LabeledCrawler7tin(proj_name=proj_name).crawl("2000-08-01 00:00:00",
                                                  str_new_time)
    LabeledCrawlerAilab(proj_name=proj_name).crawl("2000-08-01 00:00:00",
                                                   str_new_time)
    LabeledCrawlerBaidu(proj_name=proj_name).crawl("2000-08-01 00:00:00",
                                                   str_new_time)
    LabeledCrawlerSinaVR(proj_name=proj_name).crawl("2000-08-01 00:00:00",
                                                    str_new_time)
    LabeledCrawlerVarkr(proj_name=proj_name).crawl("2000-08-01 00:00:00",
                                                   str_new_time)

    seg = Segmenter(proj_name=proj_name)
    seg.seg(skip_exist=True)
    seg.join_segfile()
Пример #2
0
def fetch_nonlabeled_data():
    proj_name = "article150801160830"
    str_old_time = "2015-08-01 00:00:00"
    str_new_time = "2016-12-31 00:00:00"

    # Crawler(proj_name=proj_name).rebuild_table()
    # Crawler163(proj_name=proj_name).crawl(str_old_time, str_new_time)
    # Crawler36Kr(proj_name=proj_name).crawl(str_old_time, str_new_time)
    # CrawlerGeekPark(proj_name=proj_name).crawl(str_old_time, str_new_time)
    # CrawlerLeiphone(proj_name=proj_name).crawl(str_old_time, str_new_time)
    # CrawlerKanchai(proj_name=proj_name).crawl(str_old_time, str_new_time)
    # CrawlerHuxiu(proj_name=proj_name).crawl(str_old_time, str_new_time)

    LabeledCrawlerIheima(proj_name=proj_name).crawl(str_old_time, str_new_time)
    LabeledCrawlerKanchai(proj_name=proj_name).crawl(str_old_time,
                                                     str_new_time)
    LabeledCrawlerLeiphone(proj_name=proj_name).crawl(str_old_time,
                                                      str_new_time)
    LabeledCrawlerLieyun(proj_name=proj_name).crawl(str_old_time, str_new_time)
    LabeledCrawlerSootoo(proj_name=proj_name).crawl(str_old_time, str_new_time)
    LabeledCrawlerYiou(proj_name=proj_name).crawl(str_old_time, str_new_time)
    LabeledCrawler7tin(proj_name=proj_name).crawl("2000-08-01 00:00:00",
                                                  str_new_time)
    LabeledCrawlerAilab(proj_name=proj_name).crawl("2000-08-01 00:00:00",
                                                   str_new_time)
    LabeledCrawlerBaidu(proj_name=proj_name).crawl("2000-08-01 00:00:00",
                                                   str_new_time)
    LabeledCrawlerSinaVR(proj_name=proj_name).crawl("2000-08-01 00:00:00",
                                                    str_new_time)
    LabeledCrawlerVarkr(proj_name=proj_name).crawl("2000-08-01 00:00:00",
                                                   str_new_time)

    seg = Segmenter(proj_name=proj_name)
    seg.seg(skip_exist=True)
    seg.join_segfile()