Example #1
    def __init__(self,
                 logger,
                 job,
                 db=None,
                 queue=None,
                 manager=None,
                 modules=None):
        """
		Basic init, just make sure our thread name is meaningful

		:param Database db:  Database connection - if not given, a new one will be created
		:param JobQueue queue: Job Queue - if not given, a new one will be instantiated
		:param WorkerManager manager:  Worker manager reference
		"""
        super().__init__()
        self.name = self.type
        self.log = logger
        self.manager = manager
        self.job = job
        self.init_time = int(time.time())

        # all_modules cannot be easily imported into a worker because all_modules itself
        # imports all workers, so you get a recursive import that Python (rightly) blocks
        # so for workers, all_modules' content is passed as a constructor argument
        self.all_modules = modules

        self.db = Database(logger=self.log,
                           appname=self.type) if not db else db
        self.queue = JobQueue(logger=self.log,
                              database=self.db) if not queue else queue
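
A minimal sketch of a concrete worker built on this constructor; the base class name `BasicWorker`, the class-level `type` attribute, and the `work()` hook are assumptions inferred from `self.name = self.type` above, not part of the listing:

class ExampleWorker(BasicWorker):  # hypothetical subclass
    type = "example-worker"  # doubles as the thread name, per __init__ above

    def work(self):
        # logger, job, db and queue have all been set up by __init__
        self.log.info("%s processing job %s" % (self.name, self.job))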
Example #2
                 required=True,
                 help="Name of SQLite table containing threads.")
cli.add_argument("-p",
                 "--posts_table",
                 type=str,
                 required=True,
                 help="Name of the SQLite table containing posts.")
cli.add_argument("-b", "--board", type=str, required=True, help="Board name")
args = cli.parse_args()

if not Path(args.input).exists() or not Path(args.input).is_file():
    print("%s is not a valid folder name." % args.input)
    sys.exit(1)

logger = Logger()
db = Database(logger=logger, appname="queue-dump")

seen_post_ids = set()

# Columns from 4archive dumps
posts_columns = ["id", "chan_id", "threads_id", "chan_image_name", "image_size",
                 "image_dimensions", "thumb_dimensions", "image_url",
                 "original_image_name", "subject", "name", "chan_user_id",
                 "tripcode", "capcode", "chan_post_date", "body", "available"]
threads_columns = ["id", "thread_id", "board", "archive_date", "update_date",
                   "user_ips", "times_updated", "views", "admin_note", "secret",
                   "available", "alive", "takedown_reason", "busy", "tweeted"]

conn = sqlite3.connect(args.input)
print("Connected to SQLite database.")

count = 0
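# a sketch of how the loop that presumably follows could read the dump,
# pairing each row with the column names declared above (assumed continuation;
# the --threads_table argument name is a guess based on the truncated flag above)
cursor = conn.cursor()
for row in cursor.execute("SELECT * FROM %s" % args.threads_table):
    thread = dict(zip(threads_columns, row))
    count += 1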
Example #3
    default=True)

args = cli.parse_args()
args.truncate = bool(args.truncate)
limit = int(args.limit)

sourcefile = Path(args.input)
if not sourcefile.exists():
    print("The file %s does not exist" % sourcefile)
    exit(1)

dbconn = sqlite3.connect(args.input)
dbconn.row_factory = sqlite3.Row
cursor = dbconn.cursor()

db = Database(logger=Logger())
with open("database.sql") as seedfile:
    db.execute(seedfile.read())
if args.truncate:
    db.execute("TRUNCATE posts_usenet")
    db.execute("TRUNCATE threads_usenet")
    db.execute("TRUNCATE groups_usenet")
db.commit()

post_to_threads = {}
posts = cursor.execute("SELECT * FROM postsdata")

print("Loading posts....")
done = 0
while posts:  # a cursor is always truthy, so the check below is the real exit
    post = posts.fetchone()
    if not post or (limit and done > limit):
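        # assumed continuation below this point; the original listing is cut off
        break
    done += 1
    # ... the rest of the per-post processing would follow here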
Example #4
                 required=True,
                 help="Datasource ID")
cli.add_argument("-b", "--board", type=str, required=True, help="Board name")
args = cli.parse_args()

if not Path(args.input).exists() or not Path(args.input).is_dir():
    print("%s is not a valid folder name." % args.input)
    sys.exit(1)

input = Path(args.input).resolve()
jsons = input.glob("*.json")

print("Initialising queue...")
logger = Logger()
queue = JobQueue(logger=logger,
                 database=Database(logger=logger, appname="queue-folder"))

print("Adding files to queue...")
files = 0
# stagger claim times so queued jobs become claimable 0.1 seconds apart,
# rather than all at once
deadline = time.time()
for file in jsons:
    files += 1
    file = str(file)
    queue.add_job(args.datasource + "-thread",
                  remote_id=file,
                  details={
                      "board": args.board,
                      "file": str(file)
                  },
                  claim_after=int(deadline))
    deadline += 0.1
Example #5
def run(as_daemon=True):
    if not as_daemon:
        indent_spaces = round(shutil.get_terminal_size().columns / 2) - 33
        indent = " " * indent_spaces if indent_spaces > 0 else ""
        banner = [
            "+---------------------------------------------------------------+",
            "|                                                               |",
            "|                           welcome to                          |",
            "|                                                               |",
            "|                  j88D   .o88b.  .d8b.  d888888b               |",
            "|                 j8~88  d8P  Y8 d8' `8b `~~88~~'               |",
            "|                j8' 88  8P      88ooo88    88                  |",
            "|                V88888D 8b      88~~~88    88                  |",
            "|                    88  Y8b  d8 88   88    88                  |",
            "|                    VP   `Y88P' YP   YP    YP                  |",
            "|                                                               |",
            "|               4CAT: Capture and Analysis Toolkit              |",
            "|                                                               |",
            "|                                                               |",
            "+---------------------------------------------------------------+",
            "|                  press q + enter to shut down                 |",
            "|                                                               |",
            "| WARNING: Not running as a daemon.  Quitting this process will |",
            "|                 shut down the backend as well.                |",
            "+---------------------------------------------------------------+",
        ]
        print("\n\n")
        print("\n".join(indent + line for line in banner) + "\n\n")

    # load everything
    log = Logger(output=not as_daemon)
    db = Database(logger=log, appname="main")
    queue = JobQueue(logger=log, database=db)

    # clean up after ourselves
    db.commit()
    queue.release_all()

    # make it happen
    WorkerManager(logger=log, database=db, queue=queue, as_daemon=as_daemon)
    log.info("4CAT Backend shut down.")
Example #6
# this should have been done in the 1.9 -> 1.10 migration script, but alas...
from backend.lib.database import Database
from backend.lib.logger import Logger

import psycopg2
import config

log = Logger(output=True)
db = Database(logger=log, dbname=config.DB_NAME, user=config.DB_USER, password=config.DB_PASSWORD, host=config.DB_HOST, port=config.DB_PORT, appname="4cat-migrate")

for datasource in ("4chan", "8kun", "8chan"):
	print("  Checking for %s database tables... " % datasource, end="")

	test = db.fetchone("SELECT EXISTS ( SELECT FROM information_schema.tables WHERE table_schema = %s AND table_name = %s )", ("public", "posts_%s" % datasource))
	if not test["exists"]:
		print("not available, nothing to upgrade!")
		continue

	print("  Checking if required columns exist... ", end="")
	columns = [row["column_name"] for row in db.fetchall("SELECT column_name FROM information_schema.columns WHERE table_name = %s", ("posts_%s" % datasource,))]
	if "image_url" in columns:
		print("yes!")
	else:
		print(" adding 'image_url' column to %s posts table" % datasource)
		db.execute("ALTER TABLE posts_%s ADD COLUMN image_url TEXT DEFAULT NONE" % datasource)
Example #7
from backend.lib.database import Database
from backend.lib.logger import Logger

import psycopg2
import config

log = Logger(output=True)
db = Database(logger=log,
              dbname=config.DB_NAME,
              user=config.DB_USER,
              password=config.DB_PASSWORD,
              host=config.DB_HOST,
              port=config.DB_PORT,
              appname="4cat-migrate")

print("  Checking for 4chan database tables... ", end="")
try:
    test = db.fetchone("SELECT * FROM posts_4chan LIMIT 1")
except psycopg2.ProgrammingError:
    print("not available, nothing to upgrade!")
    exit(0)

print("\n  Adding 'board' column to 4chan posts table")
db.execute("ALTER TABLE posts_4chan ADD COLUMN board TEXT DEFAULT ''")

print("  Filling 'board' column")
db.execute(
    "UPDATE posts_4chan SET board = ( SELECT board FROM threads_4chan WHERE id = posts_4chan.thread_id )"
)

print("  Creating index")
Example #8
# update database structure for chan tables to save post deletion timestamp
# separately from main table
from backend.lib.database import Database
from backend.lib.logger import Logger

import psycopg2
import config

log = Logger(output=True)
db = Database(logger=log,
              dbname=config.DB_NAME,
              user=config.DB_USER,
              password=config.DB_PASSWORD,
              host=config.DB_HOST,
              port=config.DB_PORT,
              appname="4cat-migrate")

for datasource in ("4chan", "8kun", "8chan"):
    print("  Checking for %s database tables... " % datasource, end="")
    test = db.fetchone(
        "SELECT EXISTS ( SELECT FROM information_schema.tables WHERE table_schema = %s AND table_name = %s )",
        ("public", "posts_%s" % datasource))
    if not test["exists"]:
        print("not available, nothing to upgrade!")
        continue

    print("\n  Checking if required table exists... ", end="")
    test = db.fetchone(
        "SELECT EXISTS ( SELECT FROM information_schema.tables WHERE table_schema = %s AND table_name = %s )",
        ("public", "posts_%s_deleted" % datasource))
    columns = [
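        # assumed continuation, mirroring the column check in Example #12;
        # which table's columns are inspected here is a guess
        row["column_name"] for row in db.fetchall(
            "SELECT column_name FROM information_schema.columns WHERE table_name = %s",
            ("posts_%s" % datasource, ))
    ]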
Example #9
    description="Deletes a query, the corresponding job, and any sub-queries.")
cli.add_argument("-k", "--key", required=True, help="Query key to delete.")
cli.add_argument(
    "-q",
    "--quiet",
    # type=bool is a pitfall here: argparse passes the raw string to bool(),
    # so any non-empty value (even "False") becomes True; a flag is safer
    action="store_true",
    help="Whether to skip asking for confirmation. Defaults to false.")
args = cli.parse_args()

if not args.quiet:
    confirm = input(
        "This will delete the query, and any sub-queries. Are you sure? (y/n)")
    if confirm.strip().lower() != "y":
        sys.exit(0)

logger = Logger()
database = Database(logger=logger, appname="delete-query")

# Initialize query
try:
    parent = DataSet(key=args.key, db=database)
except TypeError:
    print("No query found with that key.")
    sys.exit(1)

parent.delete()
print(
    "Done. Note that running jobs for the queries above are not stopped; you will have to wait for them to finish on their own."
)
Example #10
                 required=True,
                 help="File to read from, containing a CSV dump")
cli.add_argument("-d",
                 "--datasource",
                 type=str,
                 required=True,
                 help="Datasource ID")
cli.add_argument("-b", "--board", type=str, required=True, help="Board name")
args = cli.parse_args()

if not Path(args.input).exists() or not Path(args.input).is_file():
    print("%s is not a valid folder name." % args.input)
    sys.exit(1)

logger = Logger()
db = Database(logger=logger, appname="queue-dump")

csvnone = re.compile(r"^N$")

seen_post_ids = set()
with open(args.input, encoding="utf-8") as inputfile:
    fieldnames = ("num", "subnum", "thread_num", "op", "timestamp",
                  "timestamp_expired", "preview_orig", "preview_w",
                  "preview_h", "media_filename", "media_w", "media_h",
                  "media_size", "media_hash", "media_orig", "spoiler",
                  "deleted", "capcode", "email", "name", "trip", "title",
                  "comment", "sticky", "locked", "poster_hash",
                  "poster_country", "exif")
    reader = csv.DictReader(inputfile,
                            fieldnames=fieldnames,
                            doublequote=False,
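                            escapechar="\\")  # assumed final argument; the listing cuts off mid-call

    # a sketch of how `csvnone` is presumably applied: mapping the dump's
    # literal "N" markers to empty values while iterating the reader
    for post in reader:
        post = {key: csvnone.sub("", value) if value else value
                for key, value in post.items()}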
Example #11
    required=True,
    help="Folder to read from, containing JSON files representing threads")
cli.add_argument("-d",
                 "--datasource",
                 type=str,
                 required=True,
                 help="Datasource ID")
cli.add_argument("-b", "--board", type=str, required=True, help="Board name")
args = cli.parse_args()

if not Path(args.input).exists() or not Path(args.input).is_dir():
    print("%s is not a valid folder name." % args.input)
    sys.exit(1)

logger = Logger()
db = Database(logger=logger, appname="queue-folder")
folder = Path(args.input)

for jsonfile in folder.glob("*.json"):
    db.commit()

    try:
        with jsonfile.open() as input:
            posts = json.load(input)["posts"]
    except json.JSONDecodeError:
        print("ERROR PARSING FILE - SKIPPING: %s" % jsonfile)
        continue

    if not posts:
        print("Empy thread %s, skipping." % jsonfile)
        continue
Example #12
# this should have been done in the 1.9 -> 1.10 migration script, but alas...
from backend.lib.database import Database
from backend.lib.logger import Logger

import psycopg2
import config

log = Logger(output=True)
db = Database(logger=log,
              dbname=config.DB_NAME,
              user=config.DB_USER,
              password=config.DB_PASSWORD,
              host=config.DB_HOST,
              port=config.DB_PORT,
              appname="4cat-migrate")

for datasource in ("8kun", "8chan"):
    print("  Checking for %s database tables... " % datasource, end="")

    test = db.fetchone(
        "SELECT EXISTS ( SELECT FROM information_schema.tables WHERE table_schema = %s AND table_name = %s )",
        ("public", "posts_%s" % datasource))
    if not test["exists"]:
        print("not available, nothing to upgrade!")
        continue

    print("  Checking if required columns exist... ", end="")
    columns = [
        row["column_name"] for row in db.fetchall(
            "SELECT column_name FROM information_schema.columns WHERE table_name = %s",
            ("posts_%s" % datasource, ))
Example #13
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address

import config

from backend.lib.database import Database
from backend.lib.logger import Logger
from backend.lib.queue import JobQueue

if hasattr(config.FlaskConfig, "DEBUG") and config.FlaskConfig.DEBUG == "Test":
    database_name = config.DB_NAME_TEST
else:
    database_name = config.DB_NAME
login_manager = LoginManager()
app = Flask(__name__)
log = Logger()
db = Database(logger=log, dbname=database_name, appname="frontend")
queue = JobQueue(logger=log, database=db)

# initialize openapi endpoint collector for later specification generation
from webtool.lib.openapi_collector import OpenAPICollector
openapi = OpenAPICollector(app)

# initialize rate limiter
limiter = Limiter(app, key_func=get_remote_address)

# make sure a secret key was set in the config file, for secure session cookies
if config.FlaskConfig.SECRET_KEY == "REPLACE_THIS":
    raise Exception(
        "You need to set a FLASK_SECRET in config.py before running the web tool."
    )
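
A sketch of how these objects could then be used together in a route; the endpoint, the rate, and the `jobs` table name are assumptions:

@app.route("/api/job-count/")
@limiter.limit("10 per minute")
def job_count():
    # db.fetchone returns a dict-like row, as seen in the migration examples
    return {"jobs": db.fetchone("SELECT COUNT(*) AS count FROM jobs")["count"]}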
Example #14
cli.add_argument(
    "-f",
    "--fast",
    # type=bool would turn any non-empty string into True; a store_true flag
    # expresses this on/off switch correctly
    action="store_true",
    help="Use batch queries instead of inserting posts individually. This is far faster than 'slow' mode, "
    "but will crash if trying to insert a duplicate post, so it should only be used on an empty "
    "database or when you're sure datasets don't overlap.")
args = cli.parse_args()

if not os.path.exists(args.input):
    print("File not found: %s" % args.input)
    sys.exit(1)

db = Database(logger=Logger(), appname="4chan-import")

print("Opening %s." % args.input)
if args.skip > 0:
    print("Skipping %i posts." % args.skip)

if args.fast:
    print("Fast mode enabled.")

with open(args.input, encoding="utf-8") as inputfile:
    postscsv = csv.DictReader(inputfile,
                              fieldnames=FourPlebs.columns,
                              dialect=FourPlebs)

    postbuffer = []
    threads = {}
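
    # a sketch of the fast/slow split described by the --fast help text
    # (assumed continuation; db.insert usage mirrors Example #16, and the
    # table name here is an assumption)
    for post in postscsv:
        if args.fast:
            postbuffer.append(post)  # batch rows up for a single bulk insert later
        else:
            db.insert("posts_4chan", data=post)  # insert posts one at a time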
Example #15
from pathlib import Path

cli = argparse.ArgumentParser()
cli.add_argument("-i", "--input", required=True, help="csv to import")
args = cli.parse_args()

input = Path(args.input)
if not input.exists():
	print("File not found")
	sys.exit(1)

with open(input) as i:
	reader = csv.DictReader(i)
	# read the file once to count rows; DictReader retains the fieldnames,
	# which the column check below relies on
	rows = 0
	for row in reader:
		rows += 1

required = ("id", "thread_id", "subject", "author", "timestamp", "body")
for field in required:
	if field not in reader.fieldnames:
		print("Column '%s' missing." % field)
		sys.exit(1)

logger = Logger()
new_set = DataSet(
	parameters={"user": "******", "filename": input.name, "time": int(time.time()), "datasource": "custom",
				"board": "upload"}, type="custom",
	db=Database(logger=logger))

shutil.copyfile(input, new_set.get_results_path())
new_set.finish(rows)
Example #16
"""
import argparse
import psycopg2
import time
import sys
import re
import os

sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)) + "/..")

from backend.lib.database import Database
from backend.lib.logger import Logger
from webtool.lib.user import User

log = Logger()
db = Database(logger=log, appname="create-user")

cli = argparse.ArgumentParser()
cli.add_argument("-u", "--username", required=True, help="Name of user (must be unique)")

args = cli.parse_args()

if __name__ != "__main__":
	sys.exit(1)

if not re.match(r"[^@]+\@.*?\.[a-zA-Z]+", args.username):
	print("Please provide an e-mail address as username.")
	sys.exit(1)

try:
	db.insert("users", data={"name": args.username, "timestamp_token": int(time.time())})
Example #17
from backend.lib.database import Database
from backend.lib.logger import Logger

import psycopg2
import config

log = Logger(output=True)
db = Database(logger=log, dbname=config.DB_NAME, user=config.DB_USER, password=config.DB_PASSWORD, host=config.DB_HOST, port=config.DB_PORT, appname="4cat-migrate")

print("  Checking for 4chan database tables... ", end="")
try:
	test = db.fetchone("SELECT * FROM posts_4chan LIMIT 1")
except psycopg2.ProgrammingError:
	print("not available, nothing to upgrade!")
	exit(0)

print("  Checking if required columns exist... ", end="")
columns = [row["column_name"] for row in db.fetchall("SELECT column_name FROM information_schema.columns WHERE table_name = %s", ("posts_4chan",))]
if "board" in columns:
	print("yes!")
else:
	print(" adding 'board' column to 4chan posts table")
	db.execute("ALTER TABLE posts_4chan ADD COLUMN board TEXT DEFAULT ''")

print("  Filling 'board' column")
db.execute("UPDATE posts_4chan SET board = ( SELECT board FROM threads_4chan WHERE id = posts_4chan.thread_id )")

print("  Creating index")
db.execute("CREATE UNIQUE INDEX IF NOT EXISTS posts_4chan_id ON posts_4chan ( id, board )")

print("  Making sure nltk packages are present...")