예제 #1
0
 def _exclude_selectors(self):
     """
     Join the 'exclude' entry of the rule for the current url's site
     with the selectors stored in ``self._always_exlude``.

     The site name is obtained from ``toolkit.get_site_name(self._url)``
     and used as the key into ``self._rules``.

     :return: list of css selectors
     """
     rule = self._rules[toolkit.get_site_name(self._url)]
     return rule['exclude'] + self._always_exlude
예제 #2
0
파일: main.py 프로젝트: shigarus/NewsParser
def main():
    """Command-line entry point.

    Parses the command-line options, loads the JSON config (falling back
    to an empty one when the config path does not exist), decides which
    rules and urls to use, then runs ``HtmlTextExtractor.get_text`` on
    every url and hands each result to ``write_to_file``.

    Rule/url selection:

    * ``--url`` with ``--target``: a single rule built from the command
      line, keyed by the site name of the url.
    * ``--url`` without ``--target``: rules from the config, only the
      given url is processed.
    * no ``--url``: both rules and urls come from the config.
    """
    # Command-line interface.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-u', '--url', help='Target page url')
    arg_parser.add_argument('-t', '--target',
                            help='Css selector to process text.')
    arg_parser.add_argument('-e', '--exclude', default=None,
                            help='Css selector to exclude text.')
    arg_parser.add_argument('-c', '--config', default='config.json',
                            help='Path to config file')
    arg_parser.add_argument('-d', '--debug', action='store_true',
                            default=False)
    options = arg_parser.parse_args()

    if options.debug:
        logging.basicConfig(level=logging.DEBUG)

    # Configuration: use the file when present, otherwise an empty config.
    if not os.path.exists(options.config):
        config = {'urls': [], 'rules': {}}
    else:
        with codecs.open(options.config, 'r', encoding='utf-8') as config_file:
            config = json.load(config_file)

    # Work out which rules and which urls to process.
    if not options.url:
        rules = config['rules']
        urls = config['urls']
    else:
        # The site name is resolved even when no target is given, so an
        # unusable url is reported before any config lookup happens.
        site_name = toolkit.get_site_name(options.url)
        if not options.target:
            rules = config['rules']
        else:
            excluded = [options.exclude] if options.exclude else []
            rules = {site_name: dict(include=[options.target],
                                     exclude=excluded)}
        urls = [options.url]

    # Extract and store the text of every url.
    extractor = htmltoreadable.HtmlTextExtractor(rules)
    for page_url in urls:
        write_to_file(page_url, extractor.get_text(page_url))
예제 #3
0
 def _include_selectors(self):
     """
     Fetch the 'include' entry of the rule for the current url's site.

     The site name is obtained from ``toolkit.get_site_name(self._url)``
     and used as the key into ``self._rules``.

     :return: list of css selectors
     """
     return self._rules[toolkit.get_site_name(self._url)]['include']