Example #1
                    url,
                    inline_css=False,
                    parser_name='html.parser')
    items = doc.extract_href()
    seen = set()
    for item in items:
        # yield each English Wikipedia link once, preserving discovery order
        if 'en.wikipedia.org' not in item or item in seen:
            continue
        seen.add(item)
        yield item


if __name__ == '__main__':
    parser = ManagedArgumentParser()
    parser.add_argument('-u', '--url', help='Start URL')
    parser.add_argument('-f', '--urls_file',
                        help='File containing start URLs, one per line')
    parser.add_argument('-la',
                        '--language',
                        choices=['en', 'fr', 'de'],
                        default='en',
                        help='Language of the text (en|fr|de)')
    args = parser.parse_args()

    if args.urls_file:
        with open(args.urls_file) as f:
            # strip trailing newlines that readlines() would keep
            urls = [line.strip() for line in f]
    else:
        urls = [args.url]
    with ManagedArgumentParser.api_by_args(args) as api:
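        # Assumed continuation (the original example is cut off here). The
        # generator at the top of this example has lost its `def` line, so
        # the name `extract_wikipedia_urls` below is hypothetical, as is
        # simply printing the links it yields.
        for url in urls:
            for wiki_url in extract_wikipedia_urls(url):
                print(wiki_url)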
Example #2
from api.data_model import Domain, Site
from api.arg_parser import ManagedArgumentParser

if __name__ == '__main__':
    parser = ManagedArgumentParser()
    parser.add_argument('-f', '--file_name',
                        help='File of domains whose upstream site info will be looked up')
    parser.add_argument('-d', '--domain',
                        help='Domain whose upstream site info will be looked up')
    parser.set_defaults(dry_run=False, confirm=False, draw_graph=False,
                        sanity_check=False)
    args = parser.parse_args()

    assert args.file_name or args.domain, 'Must supply --file_name or --domain'
    if args.file_name:
        with open(args.file_name) as f:
            domains = [domain.strip() for domain in f.readlines()]
    elif args.domain:
        domains = [args.domain]

    with ManagedArgumentParser.api_by_args(args) as api:
        upstreams = {}
        for d in domains:
            domain = Domain.get(api.session(), d)
            assert domain is not None, '%s is invalid' % d
            # one lookup per domain; an empty list if it has no upstreams
            upstream_domains = domain.get_upstream_domains() or []
            upstreams[domain.domain] = [up.domain for up in upstream_domains]

        for d, upstream_list in upstreams.items():
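            # Assumed completion (the original example is cut off here):
            # report each domain together with its upstream domains.
            print('%s -> %s' % (d, ', '.join(upstream_list)))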
Example #3
            continue
        # extract the registered domain once instead of twice
        link_domain = tldextract.extract(link).domain
        if link_domain == self_domain or link_domain in famous_link_domains:
            continue
        # skip links that contain any unexpected substring
        if any(s in link for s in unexpected_str):
            continue
        parsed = furl(link)
        # note: host.split('.')[-2] assumes the host has at least two labels
        if parsed.scheme and parsed.host and parsed.host.split('.')[-2] != self_domain:
            links.append(link)
    return set(links)


if __name__ == '__main__':
    parser = ManagedArgumentParser()
    parser.add_argument('-fo', '--famous_outlink', help='Famous outlink used to crawl resources')
    parser.add_argument('-fof', '--famous_outlinks_file', help='File of famous outlinks, one per line')
    parser.add_argument('-t', '--type', choices=['image', 'javascript'], required=True, help='The type of resources')
    parser.add_argument('-fru', '--famous_resource_url', help='The famous resource URL to be added')
    parser.add_argument('-fruf', '--famous_resource_urls_file', help='File of famous resource URLs to be added')
    parser.set_defaults(draw_graph=False)
    args = parser.parse_args()

    with ManagedArgumentParser.api_by_args(args) as api:
        if args.famous_outlink or args.famous_outlinks_file:
            urls = []
            if args.famous_outlink:
                urls.append(args.famous_outlink)
            else:
                with open(args.famous_outlinks_file) as f:
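                    # Assumed completion (the original example is cut off
                    # here): read one famous outlink per line, mirroring the
                    # file handling in the other examples.
                    urls.extend(line.strip() for line in f)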
Example #4
from api.data_model import Domain, Site
from api.utils import fetch_custom_page
from api.arg_parser import ManagedArgumentParser

if __name__ == '__main__':
    # Example (commented out): fetching a custom page directly.
    # url = 'http://www.aboutbay.com/custom_page'
    # custom_url = 'http://www.aboutbay.com/hello-the-war/'
    # text_md5 = "8c042a69b6b14fadd2dd76dcbbac8e02"
    # insert_extra_links = 1
    # famous_third_party_resource_url = None
    # links = [
    #     "http://www.wellingtonna.com/lots-of-crunching-and/",
    #     "http://www.wellingtonna.com/i-ve-heard-it/"
    # ]
    # fetch_custom_page(url, custom_url, text_md5, links, insert_extra_links,
    #                   famous_third_party_resource_url)
    parser = ManagedArgumentParser()
    parser.set_defaults(confirm=False, draw_graph=False, test_google=False, sanity_check=False, dry_run=False)
    args = parser.parse_args()
    with ManagedArgumentParser.api_by_args(args) as api:
        links_to = ['http://www.wellingtonna.com/lots-of-crunching-and/']
        bucket_name = 'halo'
        site = Site.get(api.session(), 'aboutbay.com')
        content = site.serialize_to_static_data(links_to, bucket_name)
        print(content)
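        # Hypothetical follow-up (not part of the original): persist the
        # serialized page to disk instead of printing it, assuming
        # serialize_to_static_data returns text; the path is arbitrary.
        with open('/tmp/aboutbay_static.html', 'w') as out:
            out.write(content)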

Example #5
import traceback
from datetime import datetime, timedelta
from api.data_model import Domain, ExtraTargetSite
from api.arg_parser import ManagedArgumentParser

if __name__ == '__main__':
    parser = ManagedArgumentParser()
    parser.add_argument('-sd',
                        '--source_domains',
                        default='/tmp/no_renew_domains',
                        help='File of source domains, one per line')
    parser.set_defaults(draw_graph=False)
    args = parser.parse_args()

    with ManagedArgumentParser.api_by_args(args) as api:
        with open(args.source_domains) as f:
            source_domains = [d.strip() for d in f.readlines()]
        for d in source_domains:
            domain = Domain.get(api.session(), d)
            extra_target_site = ExtraTargetSite.get(api.session(), d)
            if domain and not extra_target_site:
                registered_timestamp = domain.registered_timestamp
            elif extra_target_site:
                registered_timestamp = extra_target_site.create_timestamp
            else:
                continue
            # a registration/creation is treated as valid for one year
            expire_time = registered_timestamp + timedelta(days=365)
            now_time = datetime.now()
            expire_days = expire_time - now_time
            if expire_days.days <= 0:
                if domain and domain.status != 'abandoned':
                    print('%s expired at %s, update %s -> %s' %
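                      # Assumed arguments (the original line is cut off here);
                      # the '%s -> %s' in the format string suggests a status
                      # transition for the domain.
                      (d, expire_time, domain.status, 'abandoned'))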
Example #6
from datetime import datetime
from api.data_model import Site
from api.arg_parser import ManagedArgumentParser

if __name__ == '__main__':
    parser = ManagedArgumentParser()
    parser.add_argument('-d',
                        '--domain',
                        help='Domain whose block_expire you want to check')
    parser.add_argument(
        '-f',
        '--domains_file',
        help='File of domains whose block_expire you want to check')
    parser.add_argument('--modify', dest='modify', action='store_true')
    parser.add_argument('--no_modify', dest='modify', action='store_false')
    parser.set_defaults(dry_run=True,
                        draw_graph=False,
                        sanity_check=False,
                        modify=False)
    args = parser.parse_args()

    with ManagedArgumentParser.api_by_args(args) as api:
        domains = []
        if args.domain:
            domains.append(args.domain)
        elif args.domains_file:
            with open(args.domains_file) as f:
                # strip trailing newlines that readlines() would keep
                domains = [line.strip() for line in f]
        assert len(domains) > 0, 'Must supply a domain or a file of domains'

        for domain in domains:
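            # Assumed body (the original example is cut off here): Site.get
            # mirrors the other examples, while the block_expire attribute is
            # an assumption taken from the help text above.
            site = Site.get(api.session(), domain)
            if site is not None:
                print('%s block_expire: %s' % (domain, site.block_expire))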
Example #7
import os

from api.arg_parser import ManagedArgumentParser
from api.data_model import TargetSiteURL, TargetSiteURLRedirect, P2URL

table_names = {
    'target_site_url': TargetSiteURL,
    'target_site_url_redirect': TargetSiteURLRedirect,
    'p2_url': P2URL,
}

if __name__ == '__main__':
    parser = ManagedArgumentParser()
    parser.add_argument('-n', '--number', type=int,
                        help='Limit the query to this many rows')
    parser.add_argument('-t',
                        '--table',
                        choices=table_names.keys(),
                        required=True)
    parser.set_defaults(dry_run=True,
                        confirm=False,
                        draw_graph=False,
                        sanity_check=False)

    args = parser.parse_args()
    table_name = table_names[args.table]

    with ManagedArgumentParser.api_by_args(args) as api:
        if args.number:
            rows = api.session().query(table_name).limit(args.number)
        else:
            rows = api.session().query(table_name).all()
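        # Assumed tail (the original example is cut off here): iterate the
        # selected rows; column names are not shown, so whole row objects
        # are printed.
        for row in rows:
            print(row)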