Exemplo n.º 1
 def test_find_executable_bad_version(self):
         'pipeline runner',
         ['./run-pipeline', '../run-pipeline'],
Exemplo n.º 2
 def test_find_executable(self):
         'pipeline runner',
         ['./run-pipeline', '../run-pipeline'],
Exemplo n.º 3
    def test_find_executable_bad_version(self):
        if seesaw.six.PY3:
            exes = ['./run-pipeline3', '../run-pipeline3']
            exes = ['./run-pipeline', '../run-pipeline']

            'pipeline runner',
Exemplo n.º 4
    def test_find_executable(self):
        if seesaw.six.PY3:
            exes = ['./run-pipeline3', '../run-pipeline3']
            exes = ['./run-pipeline', '../run-pipeline']

            'pipeline runner',
Exemplo n.º 5
    def test_find_executable_regex_version(self):
        if seesaw.six.PY3:
            exes = ['./run-pipeline3', '../run-pipeline3']
            exes = ['./run-pipeline', '../run-pipeline']

            'pipeline runner',
            re.compile(seesaw.__version__.replace('.', '\\.')),
Exemplo n.º 6
from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.wpull import WpullArgs
from archivebot.seesaw.tasks import GetItemFromQueue, StartHeartbeat, \
    SetFetchDepth, PreparePaths, WriteInfo, DownloadUrlFile, \
    RelabelIfAborted, MoveFiles, SetWarcFileSizeInRedis, StopHeartbeat, \

VERSION = "20140915.01"
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests
WPULL_EXE = find_executable('Wpull', None, [ './wpull' ])
PHANTOMJS = find_executable('PhantomJS', PHANTOMJS_VERSION,
        ['phantomjs', './phantomjs', '../phantomjs'], '-v')

version_integer = (sys.version_info.major * 10) + sys.version_info.minor

assert version_integer >= 33, \
        "This pipeline requires Python >= 3.3.  You are running %s." % \

assert WPULL_EXE, 'No usable Wpull found.'
assert PHANTOMJS, 'PhantomJS %s was not found.' % PHANTOMJS_VERSION
assert 'RSYNC_URL' in env, 'RSYNC_URL not set.'
assert 'REDIS_URL' in env, 'REDIS_URL not set.'

if StrictVersion(seesaw.__version__) < StrictVersion("0.1.8b1"):
Exemplo n.º 7
import json

from os import environ as env
from urlparse import urlparse
from seesaw.project import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *

from seesaw.util import find_executable

VERSION = "20131101.01"
USER_AGENT = "ArchiveTeam ArchiveBot/%s" % VERSION
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests
WGET_LUA = find_executable("Wget+Lua", "GNU Wget 1.14.0-archivebot1", ["./wget-lua"])

if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

if "RSYNC_URL" not in env:
    raise Exception("RSYNC_URL not set.")

if "REDIS_URL" not in env:
    raise Exception("REDIS_URL not set.")

if "LOG_CHANNEL" not in env:
    raise Exception("LOG_CHANNEL not set.")

Exemplo n.º 8
from config import *
from depcheck import *


# Find a useful grabProject executable.
GRAB_TEST = find_executable(

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = "20151123.05"
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'googlecodersync'
Exemplo n.º 9
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

# Find a useful Wget+Lua executable.
# WGET_LUA will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WGET_LUA = find_executable(
    ['GNU Wget 1.14.lua.20130523-9a5c', 'GNU Wget 1.14.lua.20160530-955376b'],

if not WGET_LUA:
    raise Exception('No usable Wget+Lua found.')

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
Exemplo n.º 10
    raise Exception("This pipeline needs seesaw version 0.1.5 or higher.")

# Find a useful Wget+Lua executable.
# WGET_LUA will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WGET_LUA = find_executable(
    ["GNU Wget 1.14.lua.20130523-9a5c"],

if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

# Determine if FFMPEG is available
# Should probably utilize an ffmpeg build (or source) distributed from the
# repo to avoid nasty API incompatibilities between FFMPEG versions.
# However, if the options used are relatively simple, using distro-provided
Exemplo n.º 11
import json

from os import environ as env
from urlparse import urlparse
from seesaw.project import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *

from seesaw.util import find_executable

VERSION = "20131101.01"
USER_AGENT = "ArchiveTeam ArchiveBot/%s" % VERSION
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests
WGET_LUA = find_executable('Wget+Lua', "GNU Wget 1.14.0-archivebot1",

if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

if 'RSYNC_URL' not in env:
    raise Exception('RSYNC_URL not set.')

if 'REDIS_URL' not in env:
    raise Exception('REDIS_URL not set.')

if 'LOG_CHANNEL' not in env:
    raise Exception('LOG_CHANNEL not set.')

Exemplo n.º 12
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

# Find a useful Wget+AT executable.
# WGET_AT will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string

WGET_AT = find_executable(
        'GNU Wget 1.20.3-at.20211001.01',
        'GNU Wget 1.21.3-at.20220503.02'

if not WGET_AT:
    raise Exception('No usable Wget+At found.')

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

# Find a useful Wget+AT executable.
# WGET_AT will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string

WGET_AT = find_executable('Wget+AT', [
    'GNU Wget 1.20.3-at.20210212.02',
], ['./wget-at', '/home/warrior/data/wget-at'])

if not WGET_AT:
    raise Exception('No usable Wget+At found.')

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20210325.06'
USER_AGENT = 'Archive Team (ircs://irc.hackint.org#nintendone https://webirc.hackint.org/#irc://irc.hackint.org/#nintendone)'
TRACKER_ID = 'super-mario-maker-bookmarks'
TRACKER_HOST = 'legacy-api.arpa.li'
Exemplo n.º 14
# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

# Find a useful Wpull executable.
# WPULL_EXE will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WPULL_EXE = find_executable("Wpull", re.compile(r"\b1\.2\.3\b"), [
YOUTUBE_DL_EXE = find_executable(
    None,  # No version requirements
Exemplo n.º 15
from seesaw.externalprocess import WgetDownload
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

# Find a useful Wget+AT executable.
# WGET_AT will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string

WGET_AT = find_executable('Wget+AT', ['GNU Wget 1.20.3-at.20201030.01'],
                          ['./wget-at', '/home/warrior/data/wget-at'])

if not WGET_AT:
    raise Exception('No usable Wget+At found.')

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20210203.05'
USER_AGENT = 'Archive Team'
TRACKER_ID = 'halo-new'
TRACKER_HOST = 'legacy-api.arpa.li'
Exemplo n.º 16
import shutil
import json

from tornado.httpclient import HTTPClient, HTTPRequest

from seesaw.project import *
from seesaw.item import *
from seesaw.config import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.tracker import *

from seesaw.util import find_executable

WGET_LUA = find_executable('wget-lua', '1.14.lua.20130523-9a5c',
                           ['./wget-lua', 'wget-lua'])

CURL = find_executable('curl', '7.2', ['curl'])

if not WGET_LUA:
    raise Exception("wget-lua cannot be found")

if not CURL:
    raise Exception("curl cannot be found")

# ----

DATA_DIR = "data"
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0"
VERSION = "20130910.01"
TRACKER = "http://quilt.at.ninjawedding.org/patchy"
Exemplo n.º 17
from seesaw.externalprocess import WgetDownload, ExternalProcess
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

# Find a useful rsync executable
RSYNC = find_executable(

#if not RSYNC:
#    raise Exception("No usable rsync found.")

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = "20150614.01"
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'sourceforge-rsync'
Exemplo n.º 18
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.1.5"):
    raise Exception("This pipeline needs seesaw version 0.1.5 or higher.")

# Find a useful Wget+Lua executable.
# WGET_LUA will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WGET_LUA = find_executable("Wget+Lua", ["GNU Wget 1.14.lua.20130523-9a5c"], [
    "./wget-lua", "./wget-lua-warrior", "./wget-lua-local", "../wget-lua",
    "../../wget-lua", "/home/warrior/wget-lua", "/usr/bin/wget-lua"

if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

# Determine if FFMPEG is available
# Should probably utilize an ffmpeg build (or source) distributed from the
# repo to avoid nasty API incompatibilities between FFMPEG versions.
# However, if the options used are relatively simple, using distro-provided
# ffmpeg builds shouldn't be too problematic. Just be sure to add a metadata
# WARCRecord indicating the version string of the ffmpeg that is used.
FFMPEG = find_executable(
    "ffmpeg", ["ffmpeg version 2"],
    ["/usr/bin/ffmpeg", "/usr/local/bin/ffmpeg", "./ffmpeg"], "-version")
Exemplo n.º 19
# nice, though.

from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.preflight import check_wpull_args
from archivebot.seesaw.wpull import WpullArgs
from archivebot.seesaw.tasks import GetItemFromQueue, StartHeartbeat, \
    SetFetchDepth, PreparePaths, Wpull, CompressLogIfFailed, WriteInfo, DownloadUrlFile, \
    RelabelIfAborted, MoveFiles, StopHeartbeat, MarkItemAsDone, CheckIP, CheckLocalWebserver

WPULL_VERSION = ('2.0.3')
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests
WPULL_EXE = find_executable('Wpull', WPULL_VERSION, ['wpull', './wpull'],
YOUTUBE_DL = find_executable('youtube-dl', None, ['./youtube-dl'], '--version')

version_integer = (sys.version_info.major * 10) + sys.version_info.minor

assert version_integer >= 33, \
        "This pipeline requires Python >= 3.3.  You are running %s." % \

# Refuse to run on Python 3.4.0 unless the operator has explicitly set
# NO_SEGFAULT_340 to signal that their interpreter is patched.
assert (os.environ.get('NO_SEGFAULT_340')
        or sys.version_info[:3] != (3, 4, 0)), (
    "Python 3.4.0 should not be used. It may segfault. "
    "Set NO_SEGFAULT_340=1 if your Python is patched. "
    "See https://bugs.python.org/issue21435"
)

assert WPULL_EXE, 'No usable Wpull found.'
Exemplo n.º 20

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.1.5"):
	raise Exception("This pipeline needs seesaw version 0.1.5 or higher.")

# Find a useful rsync_size_tester executable.
RSYNC_TEST = find_executable(

# Yes, this is hackish, but run-pipeline won't let you add more command-line
# args. If the file "LARGE-RSYNC" is in the directory, allow larger rsyncs;
# otherwise fall back to the smaller default limit.
# Using gigabytes, not gibibytes, to be safe.
if os.path.isfile("LARGE-RSYNC"):
	MAX_RSYNC = "150000000000"
else:
	# Without the marker file the large limit must not apply, and MAX_RSYNC
	# must still be defined for the rest of the pipeline.
	MAX_RSYNC = "25000000000"

Exemplo n.º 21
# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

# Find a useful Wget+AT executable.
# WGET_AT will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WGET_AT = find_executable(
        'GNU Wget 1.20.3-at.20200401.01',
        'GNU Wget 1.20.3-at.20200804.01',
        'GNU Wget 1.20.3-at.20200902.01'

if not WGET_AT:
    raise Exception('No usable Wget+Lua found.')

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20200902.01'
Exemplo n.º 22
from seesaw.config import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.tracker import *
from seesaw.util import find_executable

WGET_LUA = find_executable(
    "GNU Wget 1.14.lua.20130120-8476",

if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

USER_AGENT = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27"
VERSION = "20130129.01"

Exemplo n.º 23
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

# Find a useful Wget+Lua executable.
# WGET_LUA will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WGET_LUA = find_executable(
    ['GNU Wget 1.14.lua.20130523-9a5c', 'GNU Wget 1.14.lua.20160530-955376b'],

if not WGET_LUA:
    raise Exception('No usable Wget+Lua found.')

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
Exemplo n.º 24
from seesaw.tracker import GetItemFromTracker, PrepareStatsForTracker, \
    UploadWithTracker, SendDoneToTracker
from seesaw.util import find_executable
import zstandard

if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

# Find a useful Wget+AT executable.
# WGET_AT will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string

WGET_AT = find_executable('Wget+AT', ['GNU Wget 1.20.3-at.20201030.01'],

if not WGET_AT:
    raise Exception('No usable Wget+At found.')

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20201112.01'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
TRACKER_ID = 'urls'
TRACKER_HOST = 'trackerproxy.archiveteam.org'
Exemplo n.º 25

from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.preflight import check_wpull_args
from archivebot.seesaw.wpull import WpullArgs
from archivebot.seesaw.tasks import GetItemFromQueue, StartHeartbeat, \
    SetFetchDepth, PreparePaths, WriteInfo, DownloadUrlFile, \
    RelabelIfAborted, MoveFiles, StopHeartbeat, MarkItemAsDone, CheckIP

VERSION = "20150715.01"
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests
WPULL_EXE = find_executable('Wpull', None, ['./wpull'])
PHANTOMJS = find_executable('PhantomJS', PHANTOMJS_VERSION,
        ['phantomjs', './phantomjs', '../phantomjs'], '-v')
YOUTUBE_DL = find_executable('youtube-dl', None, ['./youtube-dl'], '--version')

version_integer = (sys.version_info.major * 10) + sys.version_info.minor

assert version_integer >= 33, \
        "This pipeline requires Python >= 3.3.  You are running %s." % \

# Refuse to run on Python 3.4.0 unless the operator has explicitly set
# NO_SEGFAULT_340 to signal that their interpreter is patched.
assert (os.environ.get('NO_SEGFAULT_340')
        or sys.version_info[:3] != (3, 4, 0)), (
    "Python 3.4.0 should not be used. It may segfault. "
    "Set NO_SEGFAULT_340=1 if your Python is patched. "
    "See https://bugs.python.org/issue21435"
)
Exemplo n.º 26
import json

from os import environ as env
from urlparse import urlparse
from seesaw.project import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *

from seesaw.util import find_executable

VERSION = "20140119.01"
USER_AGENT = "ArchiveTeam ArchiveBot/%s" % VERSION
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests
WGET_LUA = find_executable('Wget+Lua', "GNU Wget 1.14.0-archivebot1",
        [ './wget-lua' ])

if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

if 'RSYNC_URL' not in env:
    raise Exception('RSYNC_URL not set.')

if 'REDIS_URL' not in env:
    raise Exception('REDIS_URL not set.')

if 'LOG_CHANNEL' not in env:
    raise Exception('LOG_CHANNEL not set.')

Exemplo n.º 27
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

# Find a useful Wget+Lua executable.
# WGET_LUA will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WGET_LUA = find_executable('Wget+Lua', ['GNU Wget 1.20.3-at-lua'], [
    './wget-lua', './wget-lua-warrior', './wget-lua-local', '../wget-lua',
    '../../wget-lua', '/home/warrior/wget-lua', '/usr/bin/wget-lua'

if not WGET_LUA:
    raise Exception('No usable Wget+Lua found.')

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20190925.01'
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'sketch'
TRACKER_HOST = 'tracker.archiveteam.org'
    UploadWithTracker, SendDoneToTracker
from seesaw.util import find_executable

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

# Find a useful Python 3.5 executable.
# PYTHON35_EXE will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
PYTHON35_EXE = find_executable("Python 3.5", re.compile(r"^Python 3\.5"), [

if not PYTHON35_EXE:
    raise Exception("No usable python3.5 library found.")
if not os.environ.get('s3access'):
    raise Exception("s3 access key missing")
if not os.environ.get('s3secret'):
    raise Exception("s3 secret key missing")

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
Exemplo n.º 29
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.3"):
    raise Exception("This pipeline needs seesaw version 0.8.3 or higher.")

# Find a useful Wpull executable.
# WPULL_EXE will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WPULL_EXE = find_executable(

if not WPULL_EXE:
    raise Exception("No usable Wpull found.")

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
Exemplo n.º 30
    UploadWithTracker, SendDoneToTracker
from tornado.ioloop import IOLoop
import zstandard

if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

# Find a useful Wget+AT executable.
# WGET_AT will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string

WGET_AT = find_executable('Wget+AT', [
    'GNU Wget 1.20.3-at.20200902.01', 'GNU Wget 1.20.3-at.20200917.01',
    'GNU Wget 1.20.3-at.20200919.01', 'GNU Wget 1.20.3-at.20201030.01'
], ['./wget-at', '/usr/local/bin/wget-at'])

if not WGET_AT:
    raise Exception('No usable Wget+At found.')

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20210114.01'
USER_AGENT = 'Archive Team'
TRACKER_ID = 'github'
TRACKER_HOST = 'trackerproxy.archiveteam.org'
Exemplo n.º 31
from seesaw.util import find_executable

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

# Find a useful Wget+Lua executable.
# WGET_LUA will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WGET_LUA = find_executable(
    ["GNU Wget 1.14.lua.20130523-9a5c", "GNU Wget 1.14.lua.20160530-955376b"],
        "./wget-lua", "./wget-lua-warrior", "./wget-lua-local", "../wget-lua",
        "../../wget-lua", "/home/warrior/wget-lua", "/usr/bin/wget-lua"

if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = "20161212.01"
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'exua'
Exemplo n.º 32
from seesaw.util import find_executable

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

# Find a useful Wpull executable.
# WPULL_EXE will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WPULL_EXE = find_executable("Wpull", re.compile(r"\b1\.2\.3\b"), [

if not WPULL_EXE:
    raise Exception("No usable Wpull found.")

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = "20170826.01"
TRACKER_ID = 'newsgrabber'
TRACKER_HOST = 'tracker.archiveteam.org'
Exemplo n.º 33
# import requests

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion('0.10.3'):
    raise Exception('This pipeline needs seesaw version 0.10.3 or higher.')

# Find a useful Python 3 executable.
# PYTHON will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
PYTHON = find_executable(
    'Python3', ['Python 3.8', 'Python 3.7', 'Python 3.6', 'Python 3.5'], [

if not PYTHON:
    raise Exception('No usable Python 3 found.')

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20191202.02'
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'yahoo-groups-api'
# TRACKER_HOST = 'tracker.archiveteam.org'  #prod-env
Exemplo n.º 34
      return path
  return None

# Find a useful Wget+Lua executable.
# WGET_LUA will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WGET_LUA = find_executable("Wget+Lua",
    [ "GNU Wget 1.14.lua.20130120-8476",
      "GNU Wget 1.14.lua.20130407-1f1d",
      "GNU Wget 1.14.lua.20130427-92d2",
      "GNU Wget 1.14.lua.20130523-9a5c" ],
    [ "./wget-lua",
      "/usr/bin/wget-lua" ])

if not WGET_LUA:
  raise Exception("No usable Wget+Lua found.")

# The user agent for external requests.
# Use this constant in the Wget command line.
USER_AGENT = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.28"
Exemplo n.º 35
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

# Find a useful Wget+Lua executable.
# WGET_LUA will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WGET_LUA = find_executable(
    ["GNU Wget 1.14.lua.20130523-9a5c"],

if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

# The version number of this pipeline definition.
# Update this each time you make a non-cosmetic change.
Exemplo n.º 36

from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.preflight import check_wpull_args
from archivebot.seesaw.wpull import WpullArgs
from archivebot.seesaw.tasks import GetItemFromQueue, StartHeartbeat, \
    SetFetchDepth, PreparePaths, WriteInfo, DownloadUrlFile, \
    RelabelIfAborted, MoveFiles, StopHeartbeat, MarkItemAsDone, CheckIP

VERSION = "20150424.01"
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests
WPULL_EXE = find_executable('Wpull', None, [ './wpull' ])
PHANTOMJS = find_executable('PhantomJS', PHANTOMJS_VERSION,
        ['phantomjs', './phantomjs', '../phantomjs'], '-v')

version_integer = (sys.version_info.major * 10) + sys.version_info.minor

assert version_integer >= 33, \
        "This pipeline requires Python >= 3.3.  You are running %s." % \

# Refuse to run on Python 3.4.0 unless the operator has explicitly set
# NO_SEGFAULT_340 to signal that their interpreter is patched.
assert (os.environ.get('NO_SEGFAULT_340')
        or sys.version_info[:3] != (3, 4, 0)), (
    "Python 3.4.0 should not be used. It may segfault. "
    "Set NO_SEGFAULT_340=1 if your Python is patched. "
    "See https://bugs.python.org/issue21435"
)
Exemplo n.º 37
# nice, though.

from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.tasks import GetItemFromQueue, StartHeartbeat, \
    SetFetchDepth, PreparePaths, WriteInfo, DownloadUrlFile, \
    RelabelIfAborted, MoveFiles, SetWarcFileSizeInRedis, StopHeartbeat, \

VERSION = "20140819.03"
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests
WPULL_EXE = find_executable('Wpull', None, [ './wpull' ])
PHANTOMJS = find_executable('PhantomJS', '1.9.7',
        ['phantomjs', './phantomjs'], '-v')

version_integer = (sys.version_info.major * 10) + sys.version_info.minor

assert version_integer >= 33, \
        "This pipeline requires Python >= 3.3.  You are running %s." % \

# Fail fast at import time if a required executable or environment setting
# is missing.
# NOTE(review): assert statements are stripped under `python -O`, so these
# checks do not run in optimized mode.
assert WPULL_EXE, 'No usable Wpull found.'
# The message names the version passed to find_executable for PhantomJS
# ('1.9.7'); it previously reported 1.9.0.
assert PHANTOMJS, 'PhantomJS 1.9.7 was not found.'
assert 'RSYNC_URL' in env, 'RSYNC_URL not set.'
assert 'REDIS_URL' in env, 'REDIS_URL not set.'

if StrictVersion(seesaw.__version__) < StrictVersion("0.1.8b1"):
Exemplo n.º 38
import shutil
import json

from tornado.httpclient import HTTPClient, HTTPRequest

from seesaw.project import *
from seesaw.item import *
from seesaw.config import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.tracker import *

from seesaw.util import find_executable

WGET_LUA = find_executable('wget-lua', '1.14.lua.20130523-9a5c',
                           ['./wget-lua', 'wget-lua'])

CURL = find_executable('curl', '7.2', ['curl'])

if not WGET_LUA:
    raise Exception("wget-lua cannot be found")

if not CURL:
    raise Exception("curl cannot be found")

# ----

DATA_DIR = "data"
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0"
VERSION = "20130910.01"
TRACKER = "http://quilt.at.ninjawedding.org/patchy"