        url, inline_css=False, parser_name='html.parser')
    items = doc.extract_href()
    urls = []
    for item in items:
        # Keep only English-Wikipedia links, and yield each one at most once.
        if 'en.wikipedia.org' not in item:
            continue
        if item in urls:
            continue
        urls.append(item)
        yield item


if __name__ == '__main__':
    parser = ManagedArgumentParser()
    parser.add_argument('-u', '--url', help='Start url.')
    parser.add_argument('-f', '--urls_file', help="The urls' file")
    parser.add_argument('-la', '--language', choices=['en', 'fr', 'de'],
                        default='en', help='Select language for text [en|fr|de]')
    args = parser.parse_args()
    if args.urls_file:
        with open(args.urls_file) as f:
            # Strip trailing newlines so the urls are usable as-is.
            urls = [u.strip() for u in f.readlines()]
    else:
        urls = [args.url]
    with ManagedArgumentParser.api_by_args(args) as api:
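# --- A minimal, self-contained sketch of the dedupe-and-yield pattern used
# above. It swaps the project's doc.extract_href() for a stdlib HTMLParser;
# HrefCollector and iter_wikipedia_links are hypothetical names, not part of
# the codebase.
from html.parser import HTMLParser


class HrefCollector(HTMLParser):
    def __init__(self):
        super().__init__()
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        # Record every href attribute found on an <a> tag.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.hrefs.append(value)


def iter_wikipedia_links(html_text):
    # Yield each en.wikipedia.org link at most once, in document order.
    collector = HrefCollector()
    collector.feed(html_text)
    seen = set()
    for href in collector.hrefs:
        if 'en.wikipedia.org' not in href or href in seen:
            continue
        seen.add(href)
        yield href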
from api.data_model import Domain, Site
from api.arg_parser import ManagedArgumentParser


if __name__ == '__main__':
    parser = ManagedArgumentParser()
    parser.add_argument('-f', '--file_name',
                        help='Find the upstream site info for every domain in this file')
    parser.add_argument('-d', '--domain',
                        help='Find the upstream site info for this domain')
    parser.set_defaults(dry_run=False, confirm=False, draw_graph=False,
                        sanity_check=False)
    args = parser.parse_args()
    assert args.file_name or args.domain, 'Must input an arg'
    if args.file_name:
        with open(args.file_name) as f:
            domains = [domain.strip() for domain in f.readlines()]
    elif args.domain:
        domains = [args.domain]
    with ManagedArgumentParser.api_by_args(args) as api:
        upstreams = {}
        for d in domains:
            domain = Domain.get(api.session(), d)
            assert domain is not None, '%s is invalid' % d
            # Collect the upstream domains under each input domain.
            upstreams.setdefault(domain.domain, [])
            for upstream in domain.get_upstream_domains():
                upstreams[domain.domain].append(upstream.domain)
        for d in upstreams.items():
            # d is a (domain, upstream_domains) tuple.
            print('%s: %s' % d)
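# --- The upstreams mapping above is the setdefault-then-append accumulation
# pattern; a standalone sketch with hypothetical data (group_upstreams is not
# a codebase function):
def group_upstreams(pairs):
    # pairs: iterable of (domain, upstream_domain) tuples.
    upstreams = {}
    for domain, upstream in pairs:
        upstreams.setdefault(domain, []).append(upstream)
    return upstreams


# group_upstreams([('a.com', 'cdn.a.com'), ('a.com', 'img.a.com')])
# -> {'a.com': ['cdn.a.com', 'img.a.com']}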
            continue
        if tldextract.extract(link).domain == self_domain:
            continue
        if tldextract.extract(link).domain in famous_link_domains:
            continue
        for s in unexpected_str:
            if s in link:
                break
        else:
            # No unexpected substring matched: keep well-formed external links.
            if (furl(link).scheme and furl(link).host
                    and furl(link).host.split('.')[-2] != self_domain):
                links.append(link)
    return set(links)


if __name__ == '__main__':
    parser = ManagedArgumentParser()
    parser.add_argument('-fo', '--famous_outlink',
                        help='The famous_outlink is used to crawl resources.')
    parser.add_argument('-fof', '--famous_outlinks_file',
                        help="The famous_outlinks' file")
    parser.add_argument('-t', '--type', choices=['image', 'javascript'],
                        required=True, help='The type of resources')
    parser.add_argument('-fru', '--famous_resource_url',
                        help='The famous resource to be added')
    parser.add_argument('-fruf', '--famous_resource_urls_file',
                        help="The famous resource urls' file to be added")
    parser.set_defaults(draw_graph=False)
    args = parser.parse_args()
    with ManagedArgumentParser.api_by_args(args) as api:
        if args.famous_outlink or args.famous_outlinks_file:
            urls = []
            if args.famous_outlink:
                urls.append(args.famous_outlink)
            else:
                with open(args.famous_outlinks_file) as f:
                    urls = [u.strip() for u in f.readlines()]
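# --- A standalone sketch of the outlink filter logic above, assuming the
# tldextract package is installed; is_external_link and its parameters are
# hypothetical stand-ins for the surrounding function's locals:
import tldextract


def is_external_link(link, self_domain, famous_link_domains, unexpected_str):
    registered = tldextract.extract(link).domain
    if registered == self_domain:
        return False  # link points back at the site itself
    if registered in famous_link_domains:
        return False  # domain is already known
    if any(s in link for s in unexpected_str):
        return False  # link contains a blacklisted substring
    return True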
from api.data_model import Domain, Site
from api.utils import fetch_custom_page
from api.arg_parser import ManagedArgumentParser


if __name__ == '__main__':
    # url = 'http://www.aboutbay.com/custom_page'
    # custom_url = 'http://www.aboutbay.com/hello-the-war/'
    # text_md5 = "8c042a69b6b14fadd2dd76dcbbac8e02"
    # insert_extra_links = 1
    # famous_third_party_resource_url = None
    # links = [
    #     "http://www.wellingtonna.com/lots-of-crunching-and/",
    #     "http://www.wellingtonna.com/i-ve-heard-it/"
    # ]
    # fetch_custom_page(url, custom_url, text_md5, links, insert_extra_links,
    #                   famous_third_party_resource_url)
    parser = ManagedArgumentParser()
    parser.set_defaults(confirm=False, draw_graph=False, test_google=False,
                        sanity_check=False, dry_run=False)
    args = parser.parse_args()
    with ManagedArgumentParser.api_by_args(args) as api:
        links_to = ['http://www.wellingtonna.com/lots-of-crunching-and/']
        bucket_name = 'halo'
        site = Site.get(api.session(), 'aboutbay.com')
        content = site.serialize_to_static_data(links_to, bucket_name)
        print(content)
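# --- The text_md5 value in the commented-out example above looks like an MD5
# hex digest of the page text; a stdlib sketch of producing one (text_md5_of
# is a hypothetical helper, not a codebase function):
import hashlib


def text_md5_of(text):
    # MD5 hex digest of the UTF-8 encoded page text.
    return hashlib.md5(text.encode('utf-8')).hexdigest()


# text_md5_of('hello') -> '5d41402abc4b2a76b9719d911017c592'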
import traceback
from datetime import datetime, timedelta

from api.data_model import Domain, ExtraTargetSite
from api.arg_parser import ManagedArgumentParser


if __name__ == '__main__':
    parser = ManagedArgumentParser()
    parser.add_argument('-sd', '--source_domains',
                        default='/tmp/no_renew_domains')
    parser.set_defaults(draw_graph=False)
    args = parser.parse_args()
    with ManagedArgumentParser.api_by_args(args) as api:
        with open(args.source_domains) as f:
            source_domains = [d.strip() for d in f.readlines()]
        for d in source_domains:
            domain = Domain.get(api.session(), d)
            extra_target_site = ExtraTargetSite.get(api.session(), d)
            if domain and not extra_target_site:
                registered_timestamp = domain.registered_timestamp
            elif extra_target_site:
                registered_timestamp = extra_target_site.create_timestamp
            else:
                continue
            # Treat a domain as expired 365 days after registration.
            expire_time = registered_timestamp + timedelta(days=365)
            now_time = datetime.now()
            expire_days = expire_time - now_time
            if expire_days.days <= 0:
                if domain and domain.status != 'abandoned':
                    print('%s expired at %s, update %s -> %s' %
                          (d, expire_time, domain.status, 'abandoned'))
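# --- A standalone sketch of the one-year expiry arithmetic above
# (days_until_expiry is a hypothetical helper, not a codebase function):
from datetime import datetime, timedelta


def days_until_expiry(registered_timestamp, now=None):
    # Domains are treated as expiring 365 days after registration; a
    # negative result means the domain has already expired.
    expire_time = registered_timestamp + timedelta(days=365)
    now = now or datetime.now()
    return (expire_time - now).days


# days_until_expiry(datetime(2023, 1, 1), now=datetime(2024, 1, 2)) -> -1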
from datetime import datetime

from api.data_model import Site
from api.arg_parser import ManagedArgumentParser


if __name__ == '__main__':
    parser = ManagedArgumentParser()
    parser.add_argument('-d', '--domain',
                        help='The domain you want to check the block_expire for')
    parser.add_argument('-f', '--domains_file',
                        help="The domains' file you want to check the block_expire for")
    parser.add_argument('--modify', dest='modify', action='store_true')
    parser.add_argument('--no_modify', dest='modify', action='store_false')
    parser.set_defaults(dry_run=True, draw_graph=False, sanity_check=False,
                        modify=False)
    args = parser.parse_args()
    with ManagedArgumentParser.api_by_args(args) as api:
        domains = []
        if args.domain:
            domains.append(args.domain)
        elif args.domains_file:
            with open(args.domains_file) as f:
                # Strip trailing newlines so the domains are usable as-is.
                domains = [d.strip() for d in f.readlines()]
        assert len(domains) > 0, "Must input a domain or a domains' file"
        for domain in domains:
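# --- The --modify/--no_modify pair above is plain argparse's paired-flag
# toggle: both flags write to the same dest, and set_defaults supplies the
# fallback. A self-contained demonstration with the stdlib parser:
import argparse

_parser = argparse.ArgumentParser()
_parser.add_argument('--modify', dest='modify', action='store_true')
_parser.add_argument('--no_modify', dest='modify', action='store_false')
_parser.set_defaults(modify=False)

assert _parser.parse_args([]).modify is False
assert _parser.parse_args(['--modify']).modify is True
assert _parser.parse_args(['--no_modify']).modify is False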
import os

from api.arg_parser import ManagedArgumentParser
from api.data_model import TargetSiteURL, TargetSiteURLRedirect, P2URL

table_names = {
    'target_site_url': TargetSiteURL,
    'target_site_url_redirect': TargetSiteURLRedirect,
    'p2_url': P2URL,
}

if __name__ == '__main__':
    parser = ManagedArgumentParser()
    parser.add_argument('-n', '--number', type=int,
                        help='Limit the query to this many rows')
    parser.add_argument('-t', '--table', choices=table_names.keys(),
                        required=True)
    parser.set_defaults(dry_run=True, confirm=False, draw_graph=False,
                        sanity_check=False)
    args = parser.parse_args()
    table_name = table_names[args.table]
    with ManagedArgumentParser.api_by_args(args) as api:
        if args.number:
            target_site = api.session().query(table_name).limit(args.number)
        else:
            target_site = api.session().query(table_name).all()
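# --- A self-contained sketch of the limit-vs-all query pattern above, using
# an in-memory SQLite database; DemoURL is a hypothetical model, not one of
# the codebase tables (assumes SQLAlchemy 1.4+):
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class DemoURL(Base):
    __tablename__ = 'demo_url'
    id = Column(Integer, primary_key=True)
    url = Column(String)


engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
session.add_all([DemoURL(url='http://example.com/%d' % i) for i in range(5)])
session.commit()

print(session.query(DemoURL).limit(3).all())  # first 3 rows only
print(session.query(DemoURL).all())           # all 5 rows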