# Example 1
 def parse(self, response):
     """Yield NewsItem objects for today's tracked programs (Asahi listing).

     The schedule page has one <td valign="top"> column per weekday;
     only the column matching today's weekday is scanned.
     """
     today_idx = date.today().weekday()
     for col_idx, day_column in enumerate(response.css('td[valign="top"]')):
         # Skip every weekday column except today's.
         if col_idx != today_idx:
             continue
         item = NewsItem()
         text = ''
         # Collect the summary text only for programs we are tracking.
         for program in day_column.css('table.new_day'):
             name = program.css(
                 'span.prog_name a.bangumiDetailOpen::text').extract_first()
             if name is None:
                 continue
             matched = get_target_bangumi_name(name.strip(),
                                               TARGET_BANGUMI_DICT)
             if matched is None:
                 continue
             print(name)
             text = program.css(
                 'span.expo_org a.bangumiDetailOpen::text').extract_first()
             if text is None:
                 continue
             text = remove_words(text,
                                 TARGET_BANGUMI_DICT[matched]['rm_word'],
                                 KYOKU_SEP_DICT['asahi'])
             if text != '':
                 item['text'] = text
                 yield item
# Example 2
 def parse(self, response):
     """Parse the NTV schedule page and follow each tracked program's
     detail page.

     For every <td> cell whose "oa" text starts with an HH:MM air time,
     look up the program name against TARGET_BANGUMI_DICT and, on a
     match, follow the detail link with the cleaned summary text in meta.
     """
     for cell in response.css('tbody td'):
         item = NewsItem()
         oa = cell.css('p.oa::text').extract_first()
         if oa is None:
             continue
         # Only cells whose "oa" text starts with an HH:MM time are programs.
         if not re.match(r'\d{2}\:\d{2}', oa.strip()):
             continue
         # Guard: some cells lack an <h3>; the original crashed with
         # AttributeError on .split() here.
         name_raw = cell.css('h3::text').extract_first()
         if name_raw is None:
             continue
         # Is this one of the programs we track?
         target_name = get_target_bangumi_name(name_raw.split(),
                                               TARGET_BANGUMI_DICT)
         if target_name is None:
             continue
         # Program summary is the second <p> text node; guard the index
         # (the original `extract()[1]` raised IndexError on short cells,
         # and its `if text is None` check was dead code).
         texts = cell.css('p::text').extract()
         if len(texts) < 2:
             continue
         text = remove_words(
             texts[1].strip(),
             TARGET_BANGUMI_DICT[target_name]['rm_word'],
             KYOKU_SEP_DICT['ntv']
         )
         # Guard: a cell without a link would have crashed on .strip().
         desc_url = cell.css('a::attr(href)').extract_first()
         if desc_url is None:
             continue
         yield response.follow(
             desc_url.strip(),
             callback=self.parse_desc_page,
             meta={
                 'item': item,
                 'text': text,
                 'target_name': target_name
             }
         )
# Example 3
 def parse_desc_page(self, response):
     """Parse a TBS program detail page and yield the item with its text.

     Bug fix: when the page had no description, extract_first() returned
     None, which still satisfied `text != ''`, so an item with
     item['text'] = None was yielded. Now a missing description is
     skipped entirely.
     """
     item = response.meta['item']
     target_name = response.meta['target_name']
     text = response.css('div.copy-box').css('p::text').extract_first()
     if text is None:
         return
     text = remove_words(text.strip(),
                         TARGET_BANGUMI_DICT[target_name]['rm_word'],
                         KYOKU_SEP_DICT['tbs'])
     if text != '':
         item['text'] = text
         yield item
# Example 4
 def parse_desc_page(self, response):
     """Parse an NTV program detail page; prefer its detail text over the
     listing summary passed in via meta.

     Falls back to meta['text'] when the detail section is missing or
     empty.
     """
     item = response.meta['item']
     text = response.meta['text']
     target_name = response.meta['target_name']
     text_tmp = ''
     # Find the section whose heading is the literal detail header and
     # take its first paragraph.
     for info in response.css('div.program'):
         if info.css('h2::text').extract_first() == '詳細':
             text_tmp = info.css('p::text').extract_first()
             break
     # text_tmp is '' (no section), None (section without <p> text), or a
     # str; truthiness replaces the awkward
     # `(text_tmp != '') and (not text_tmp is None)` test.
     if text_tmp:
         text = remove_words(
             text_tmp.strip(),
             TARGET_BANGUMI_DICT[target_name]['rm_word'],
             KYOKU_SEP_DICT['ntv']
         )
     if text != '':
         item['text'] = text
         yield item
# Example 5
 def parse(self, response):
     """Yield NewsItem objects for tracked news/information programs
     (Fuji listing)."""
     for cell in response.css('div#wrap').css('td.info'):
         summary = ''
         genre = cell.css('span.inform::text').extract_first()
         if genre == '報道・情報':
             name = cell.css('a::text').extract_first()
             # Is this one of the programs we track?
             matched = get_target_bangumi_name(name,
                                               TARGET_BANGUMI_DICT)
             if matched is None:
                 continue
             # Grab the program summary.
             summary = cell.css('p.tx_pad::text').extract_first()
             if summary is None:
                 continue
             summary = remove_words(
                 summary.strip(),
                 TARGET_BANGUMI_DICT[matched]['rm_word'],
                 KYOKU_SEP_DICT['fuji'])
         item = NewsItem()
         if summary != '':
             item['text'] = summary
             yield item
# Example 6
def exract_target_data(f, contents_dict):
    '''Extract the relevant program data and write one line per program
    to file *f*.

    (The function name keeps its original misspelling so existing
    callers keep working.)

    Args:
        f: writable text file object; one cleaned summary per line.
        contents_dict: parsed NHK listing; programs are expected under
            contents_dict['list']['g1'].
    '''
    for content in contents_dict['list']['g1']:
        # Is this one of the programs we track?
        target_name = get_target_bangumi_name(content['title'], TARGET_BANGUMI_DICT)
        if target_name is None:
            continue
        print('target_name   ', target_name)
        # Concatenate the configured summary fields, each followed by the
        # station's first separator token.
        text = ''
        for target_content in TARGET_BANGUMI_DICT[target_name]['target']:
            text += content[target_content] + KYOKU_SEP_DICT['nhk'][0]
        print('text  ', text)
        # (The original `if text is None: continue` was dead code —
        # text is always a str at this point — and has been removed.)
        text = remove_words(
            text.strip(),
            TARGET_BANGUMI_DICT[target_name]['rm_word'],
            KYOKU_SEP_DICT['nhk']
        )
        if text != '':
            f.write(text + '\n')
def main(args):
    """Load the CSV dataset, tokenize and clean the text, split it into
    train/val/test, and run the selected classifier architecture."""
    print("Open data")
    df = pd.read_csv(args.datapath)

    # tokenize (lowercased)
    print("Tokenize text")
    tokenizer = NLTKToknizerWrapper(False)
    df["token"] = df["text"].apply(lambda t: tokenizer.tokenize(t.lower()))

    X = df["token"].values.tolist()
    y = df["label"].values.tolist()

    # optional preprocessing: strip characters, drop empties and stopwords
    print("Preprocessing")
    cleaned = []
    for tokens in X:
        stripped = [remove_characters(tok) for tok in tokens]
        cleaned.append([tok for tok in stripped if tok != ''])
    id_stopwords = stopwords.words('indonesian')
    X = [remove_words(tokens, id_stopwords) for tokens in cleaned]

    # 80/20 test split, then 90/10 train/val split of the remainder
    print("Split Data")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, test_size=0.2, random_state=4371)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, train_size=0.9, test_size=0.1, random_state=4371)

    data = X_train, y_train, X_test, y_test, X_val, y_val
    if args.architecture == "rnn":
        rnn_classify_demo.main(args, data)
    elif args.architecture == "cnn":
        cnn_classify_demo.main(args, data)