def command(cls, config_ini, org_names):
        common.load_config(config_ini)
        common.register_translator()
        from ckan.plugins import toolkit
        from ckan import model
        orgs = [toolkit.get_action('organization_show')(
                data_dict={'id': org_name})
                for org_name in org_names]
        source_org, dest_org = orgs
        assert source_org
        assert dest_org
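        # Find every dataset currently published by the source organisation
        # (capped at 1000 rows; the paging check below guards against overflow).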
        search_results = toolkit.get_action('package_search')(
            data_dict=dict(fq='publisher:%s' % source_org['name'], rows=1000))
        print 'Datasets: %s' % search_results['count']
        stats = Stats()
        if len(search_results['results']) != search_results['count']:
            assert 0, 'need to implement paging'

        #context = {
        #    'user': get_script_user(__name__)['name'],
        #    'ignore_auth': True,
        #    'model': model}
        rev = model.repo.new_revision()
        rev.author = 'script-%s.py' % __file__
        for dataset in search_results['results']:
            model.Package.get(dataset['id']).owner_org = dest_org['id']
            #dataset_ = toolkit.get_action('package_patch')(
            #    context=context,
            #    data_dict=dict(id=dataset['id'], owner_org=dest_org['id']))
            print stats.add('Changed owner_org', dataset['name'])
        print stats.report()
        print 'Writing'
        model.Session.commit()
Example No. 2
    def command(cls, config_ini, write):
        common.load_config(config_ini)
        common.register_translator()

        rev = model.repo.new_revision()
        rev.author = "script-fix_mandate.py"

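        # Normalise the 'mandate' extra so it is always stored as a JSON list
        # of strings; plain strings are wrapped and empty values are dropped.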
        for package in model.Session.query(model.Package).filter(model.Package.state == "active"):
            if "mandate" in package.extras:

                mandate = package.extras.get("mandate")
                try:
                    mandate = json.loads(mandate)
                    if isinstance(mandate, list):
                        stats.add("Already list", package.name)
                    elif isinstance(mandate, basestring):
                        stats.add("Fixing JSON string", package.name)
                        package.extras["mandate"] = json.dumps([mandate])
                    else:
                        stats.add("Problem JSON", package.name)
                except ValueError:
                    if mandate != "":
                        stats.add("Fixing string", package.name)
                        package.extras["mandate"] = json.dumps([mandate])
                    else:
                        stats.add("Deleting empty string", package.name)
                        del package.extras["mandate"]
            else:
                stats.add("No mandate field", package.name)

        print stats.report()

        if write:
            print "Writing"
            model.Session.commit()
Example No. 3
    def command(cls, config_ini, write):
        common.load_config(config_ini)
        common.register_translator()

        rev = model.repo.new_revision()
        rev.author = 'fix_contact_details.py'

        for package in model.Session.query(model.Package).filter_by(state='active'):
            group = package.get_organization()
            if not group:
                stats.add('was not in a group', package.name)
                continue

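            # If the dataset's contact name just duplicates its publisher's,
            # reset the dataset-level contact and FOI fields.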
            if package.extras.get('contact-name') == group.extras.get('contact-name'):
                if package_is_effected(package, group):
                    if write:
                        package.extras['contact-name'] = ''
                        package.extras['contact-email'] = ''
                        package.extras['contact-phone'] = ''
                        package.extras['foi-name'] = ''
                        package.extras['foi-email'] = ''
                        package.extras['foi-web'] = ''
                        package.extras['foi-phone'] = ''
                    stats.add('resetting', 'Resetting package %s' % package.name)

        print stats.report()
        if write:
            model.Session.commit()
Example No. 4
    def command(cls, config_ini, write, options):
        common.load_config(config_ini)
        common.register_translator()

        rev = model.repo.new_revision()
        rev.author = 'script-delete_cache_filepath.py'

        process_all = True
        if options.resource:
            cls.process_resource(model.Resource.get(options.resource))
            process_all = False
        else:
            # Get each dataset,
            counter = 0
            datasets_q = model.Session.query(model.Package) \
                    .filter_by(state='active')

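            # Page through the active datasets in batches of 100 so each batch
            # can be committed separately in write mode.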
            rounded = int(math.ceil(datasets_q.count() / 100.0)) * 100
            for x in xrange(0, rounded, 100):
                datasets = datasets_q.offset(x).limit(100)
                updated = False

                for dataset in datasets.all():
                    counter += 1

                    print "Processing dataset %d\r" % counter,
                    for resource in dataset.resources:
                        if cls.process_resource(resource):
                            updated = True

                    for key in dataset_properties_to_make_null:
                        if getattr(dataset, key):
                            stats_dp.add('Making property null: %s' % key, dataset.name)
                            setattr(dataset, key, None)
                            updated = True
                        else:
                            stats_dp.add('Property has no value: %s' % key, dataset.name)

                    for key in dataset_extras_to_remove:
                        if key in dataset.extras:
                            #stats_de.add('Removing: %s' % key, dataset.name)
                            del dataset.extras[key]
                            updated = True
                        else:
                            stats_de.add('No field to remove: %s' % key, dataset.name)

                # We will be committing 100 at a time
                if updated and write:
                    print "\nCommitting changes"
                    import time
                    s = time.time()
                    model.Session.commit()
                    print "Committed in ", time.time() - s

        print 'Resource Properties:\n', stats_rp.report(show_time_taken=False)
        print 'Resource Extras:\n', stats_re.report()
        print 'Dataset Properties:\n', stats_dp.report(show_time_taken=False)
        print 'Dataset Extras:\n', stats_de.report()
def get_datasets_from_ckan(domain):
    common.load_config(config_ini)
    common.register_translator()

    from pylons import config
    apikey = config['dgu.merge_datasets.apikey']
    ckan = ckanapi.RemoteCKAN('https://%s' % domain, apikey=apikey)
    datasets = ckan.action.package_search(q='organogram', rows=400)
    return datasets
    def command(cls, config_ini, write):
        common.load_config(config_ini)
        common.register_translator()

        def new_revision():
            rev = model.repo.new_revision()
            rev.author = 'script_delete_duplicate_datasets.py'
        if write:
            new_revision()

        publisher = model.Group.get(options.publisher)
        if publisher is None:
            print "Publisher could not be found"
            sys.exit(0)

        guids = defaultdict(list)
        for package in publisher.packages():
            guids[package.extras.get('guid')].append(package)

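        # Datasets sharing a harvest GUID are duplicates: keep one of them
        # (renamed to the best name) and mark the rest as deleted.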
        for guid, packages in guids.items():
            if guid is None:
                for package in packages:
                    stats.add('Skip package not harvested', package.name)
                continue
            if len(packages) == 1:
                stats.add('Skip guid without duplicates', guid)
                continue

            best_name = None
            for i, package in enumerate(sorted(packages,
                                               key=lambda x: x.metadata_modified,
                                               reverse=options.keep_last)):
                if (not best_name or
                    len(package.name) < len(best_name) or
                    (len(package.name) == len(best_name) and
                     package.name < best_name)):
                        best_name = package.name

                if i == 0:
                    kept_package = package
                else:
                    stats.add('Deleting', package.name)
                    package.name = package.name + '_'
                    package.state = 'deleted'

            # Write the name changes, so that we can reuse the best_name.
            stats.add('Keep', '%s->%s' % (kept_package.name, best_name))
            if write:
                model.Session.commit()
                new_revision()
            kept_package.name = best_name

        if write:
            model.Session.commit()

        print stats.report()
Example No. 7
    def command(cls, config_ini):
        common.load_config(config_ini)
        common.register_translator()

        from ckanext.dgu.model.feedback import Feedback

        comment_hashes = []

        headers = ["user_id", "package_id", "timestamp", "title", "comment"]
        writer = csv.DictWriter(sys.stdout, headers)

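        # Export visible, active feedback as CSV rows, skipping empty and
        # duplicate comments.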
        for fb in model.Session.query(Feedback)\
                .filter(Feedback.visible==True)\
                .filter(Feedback.active==True)\
                .order_by(Feedback.created):

            if not any([fb.economic, fb.social, fb.effective, fb.linked, fb.other]):
                stats.add('Missing any content', fb.id )
                continue

            user = model.User.get(fb.user_id)
            pkg = model.Package.get(fb.package_id)

            data = {
                u"timestamp": fb.created.isoformat(),
                u"package": pkg.name,
                u"item": fb
            }


            content = render_template(TEMPLATE, data)
            comment = content.replace(u'\r',u'').replace(u'\n',u'').replace(u'           ', u'')

            # Check for identical comments ... we don't want users duplicating
            # comments on the same package (most often done by mistake).
            hashkey = u'{}.{}.{}'.format(comment, fb.package_id, fb.user_id).encode('utf8', 'ignore')
            comment_hash = hashlib.md5(hashkey).hexdigest()

            if comment_hash in comment_hashes:
                stats.add('Duplicate post', fb.id )
                continue

            comment_hashes.append(comment_hash)

            row = {
                u"user_id": user.name[len("user_d"):],
                u"package_id": pkg.name,
                u"timestamp": fb.created.isoformat(),
                u"title": "Feedback on the value of this dataset ",
                u"comment": comment.encode('utf-8', 'ignore')
            }
            writer.writerow(row)

            stats.add('Processed', fb.id )
Example No. 8
def test_init_missing_cert():
    """ Try to initialize the context with a nonexistant cert. """
    config = load_config()
    config['name'] = "failboat"
    config['sign_messages'] = True
    context = FedMsgContext(**config)
    context.publish(topic='awesome', msg=dict(foo='bar'))
Example No. 9
def main(username, password):
  global config
  config = common.load_config()

  common.mkdirs(config["jive-base"])
  os.chdir(config["jive-base"])

  l = common.Lock(".lock")

  global max_index
  try:
    with open(".max-index") as f:
      max_index = int(f.read())
  except IOError:
    pass

  tls.s = requests.Session()
  login(username, password)

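  # One daemon worker thread per configured jive-thread, all sharing the
  # authenticated session's cookies.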
  threads = int(config["jive-threads"])
  for i in range(threads):
    t = threading.Thread(target = worker, name = i, args = (tls.s.cookies, ))
    t.daemon = True
    t.start()

  for c in contents():
    q.put((iter_content, c))

  q.join()
  cleanup()
  common.write_sync_done()

  global index
  with open(".max-index", "w") as f:
    print(index, file = f)
Example No. 10
def install():
    config = load_config()
    try:
        run('ls blink')
    except:
        run('git clone ' + config.get('blink', 'repository'))
    configure()
Example No. 11
 def __init__(self, configfile):
     """
     Following class attributes initialized in __init__:
     nova - client for nova service
     keystone - client for keystone service
     ceilometer - client for ceilometer service
     hosts - all hosts for region
     servers - all servers in region
     """
     self.logger = logging.getLogger(__name__)
     self.config = load_config(configfile)
     if self.config:
         try:
             auth = v3.Password(username=self.config['username'],
                 password=self.config['password'],
                 project_name=self.config['tenant_name'],
                 auth_url=self.config['auth_url'],
                 user_domain_name=self.config['domain'],
                 project_domain_name=self.config['domain'],)
             self.session = session.Session(auth=auth, verify=self.config['cacert'])
             self.nova = client_nova.Client(2, session=self.session)
             self.keystone = client_keystone.Client(session=self.session)
             self.ceilometer = client_ceilometer.Client(2, session=self.session)
             self.glance = client_glance.Client(2, session=self.session)
             self.hosts = self.nova.hosts.list()
             self.servers = self.nova.servers.list(search_opts = { 'all_tenants': 1 })
             self.projects = self.getprojects()
             self.flavors = self.getflavors()
             self.connected = True
         except Exception:
             self.logger.error('Error authenticating with credentials from ' + configfile)
             self.connected = False
Example No. 12
def main():
    global config
    config = common.load_config()

    common.mkdirs(config["product-docs-base"])
    os.chdir(config["product-docs-base"])

    l = common.Lock(".lock")

    get_dump()

    valid_files = set([".lock", ".sync-done"])
    pool = multiprocessing.Pool(processes=int(config["product-docs-threads"]))

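    # Queue one PDF download per dump entry and remember the expected paths so
    # stale local files can be removed afterwards.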
    for x in iter_dump():
        x["product_"] = x["product"].replace("_", " ")

        url = "https://access.redhat.com/documentation/%(language)s/" \
              "%(product)s/%(version)s/pdf/%(name)s/" \
              "%(product)s-%(version)s-%(name)s-%(language)s.pdf" % x
        f = "%(product_)s/%(version)s/" \
            "%(product)s-%(version)s-%(name)s-%(language)s.pdf" % x

        pool.apply_async(download, (url, f))
        valid_files.add(f)

    pool.close()
    pool.join()

    remove_invalid_files(valid_files)
    common.write_sync_done()
Example No. 13
def main():
    global config
    config = common.load_config()

    # Permit write of UTF-8 characters to stderr (required when piping output)
    if sys.stderr.encoding is None:
        sys.stderr = codecs.getwriter("UTF-8")(sys.stderr)

    common.mkdirs(config["pt-base"])
    os.chdir(config["pt-base"])

    _lock = common.Lock(".lock")
    tls.s = requests.Session()

    login()

    for i in range(int(config["pt-threads"])):
        t = threading.Thread(target = worker, name = i, args = [tls.s.cookies])
        t.daemon = True
        t.start()

    read_project_list()
    while q.unfinished_tasks:
        time.sleep(1)
    q.join()

    cleanup()
    common.write_sync_done()
def xmpp_test():
    global conf
    if not conf:
        conf = load_config()
    if not push_in_queue(construct_message(conf['xmpp_recipients'], 'test message for: %s' % conf['xmpp_recipients'])):
        return '', 500
    return '', 204
Example No. 15
 def setUp(self):
     self.config = load_config()
     self.config['name'] = local_name
     self.config['mute'] = True
     self.config['persistent_store'] = Mock()
     self.replay_context = ReplayContext(**self.config)
     self.replay_thread = ReplayThread(self.replay_context)
     self.context = zmq.Context()
def prometheus_alert():
    global conf
    if not conf:
        conf = load_config()
    alert = PrometheusAlert(request.data.decode())
    msg = alert.plain()
    html = alert.html()
    push_in_queue(construct_message(conf['xmpp_recipients'], msg, html))
    return '', 204
Example No. 17
 def setUp(self):
     self.config = load_config()
     self.config['name'] = local_name
     self.config['persistent_store'] = Mock()
     self.replay_context = ReplayContext(**self.config)
     self.request_context = zmq.Context()
     self.request_socket = self.request_context.socket(zmq.REQ)
     self.request_socket.connect(
         self.config['replay_endpoints'][local_name])
Example No. 18
    def setUp(self):
        config = load_config()
        self.hub = CentralMokshaHub(config=config)
        self.context = FedMsgContext(**config)

        # fully qualified
        self.fq_topic = "com.test_prefix.dev.unittest.foo"
        # short version
        self.topic = "foo"
Example No. 19
def add_instance(count):
    conn = connect()
    config = load_config()

    ami = config.get('aws', 'ami')
    spot_price = config.getfloat('aws', 'spot_price')
    key_name = config.get('aws', 'key_name')
    instance_type = config.get('aws', 'instance_type')
    availability_zone_group = config.get('aws', 'availability_zone_group')
    #placement = config.get('aws', 'placement')
    security_group = config.get('aws', 'security_group')

    create_ondemand_instances(conn, ami, security_group, instance_type, count, key_name)
Example No. 20
def test_init_invalid_endpoint():
    try:
        config = load_config()
        config['name'] = local_name
        config['persistent_store'] = Mock()
        tmp = zmq.Context()
        placeholder = tmp.socket(zmq.REP)
        placeholder.bind('tcp://*:{0}'.format(
            config["replay_endpoints"][local_name].rsplit(':')[-1]
        ))
        context = ReplayContext(**config)
    finally:
        placeholder.close()
Example No. 21
def connect():
    config = load_config() #ConfigParser.ConfigParser()

    aws_key = config.get('aws', 'aws_key')
    aws_secret = config.get('aws', 'aws_secret')

    conn = boto.ec2.connect_to_region(
        'us-east-1',
        aws_access_key_id = aws_key,
        aws_secret_access_key = aws_secret,
    )

    return conn
Example No. 22
def init():
    global config
    config = common.load_config()

    if config["thunderbird-base"] is None:
        return

    if isrunning():
        print >>sys.stderr, "thunderbird.py: thunderbird is running, disabling plugin"
        config["thunderbird-base"] = None
        return

    rmpath(config["thunderbird-folder"])
    mkpath(config["thunderbird-folder"])

    config["thunderbird-base"] = base(spd(config["thunderbird-folder"]))
def main():
	# fetch CLI arguments
	(
		dry_run,
		filter_include_list,
		filter_exclude_list
	) = common.read_arguments()

	# load config from file
	config_data = common.load_config()
	config_auth_token = config_data['AUTH_TOKEN']

	# fetch repository list and wiki status of the specified repository type
	print('Building repository list:')
	all_repository_set = get_repository_name_wiki_status_set(
		config_auth_token,
		config_data['REPOSITORY_TYPE'],
		common.RepositoryFilter(filter_include_list,filter_exclude_list)
	)

	# get total count, if zero then no work
	repository_count = len(all_repository_set)
	if (repository_count < 1):
		print('\nNo repositories for processing')
		return

	print('\nTotal repositories: {0}'.format(repository_count))

	# determine wiki enabled count
	wiki_enabled_repository_set = filter_repository_wiki_enabled(all_repository_set)
	wiki_enabled_count = len(wiki_enabled_repository_set)

	if (wiki_enabled_count < 1):
		# no projects enabled - no work
		print('All wikis disabled')
		return

	print('Wikis enabled: {0}'.format(wiki_enabled_count))

	# disable wikis (only simulation if dry run mode)
	print('\n\nDisabling wikis{0}:'.format(' [DRY RUN]' if (dry_run) else ''))

	for repository_name in wiki_enabled_repository_set:
		if (not dry_run):
			disable_repository_wiki(config_auth_token,repository_name)

		print(repository_name)
def main():
	# load config from file
	config_data = common.load_config(
		config_key_addition_set = { ORGANIZATION_CONFIG_KEY }
	)

	config_auth_token = config_data['AUTH_TOKEN']

	# fetch repository names/sizes of the specified type
	print('Building repository list ordered by size:')
	repository_list = get_organization_repository_size_sorted_list(
		config_auth_token,
		config_data[ORGANIZATION_CONFIG_KEY],
		config_data['REPOSITORY_TYPE']
	)

	# output list, repository URI/size - tab separated
	for repository_uri,repository_size in repository_list:
		print('{0}\t{1}'.format(repository_uri,repository_size))
def main():
    conf = load_config()
    bot = Bot(conf['xmpp_jid'], conf['xmpp_password'])
    if not timeout(bot.start, [conf['xmpp_host'], conf['xmpp_port']]):
        terminate()
    receiver = IPCReceiver(conf['mq_name'])
    try:
        while True:
            data = json.loads(receiver.receive())
            if 'html' in data:
                bot.send_message_to(data['message'], data['recipients'], html=data['html'])
            else:
                bot.send_message_to(data['message'], data['recipients'])
    except KeyboardInterrupt:
        pass
    except ipc.SignalError:
        pass
    finally:
        receiver.cleanup()
        terminate()
Example No. 26
def main(username, password):
  global config
  config = common.load_config()

  common.mkdirs(config["jive-base"])
  os.chdir(config["jive-base"])

  l = common.Lock(".lock")

  tls.s = requests.Session()
  login(username, password)

  threads = int(config["jive-threads"])
  for i in range(threads):
    t = threading.Thread(target = worker, name = i, args = (tls.s.cookies, ))
    t.daemon = True
    t.start()

  for c in contents():
    q.put((iter_content, c))

  q.join()
  cleanup()
  common.write_sync_done()
Example No. 27
def main():

    config = load_config()

    # TODO: break down into more functions?

    boto_session = get_boto_session(config)
    # TODO: if s3_resources continue to be used only to get bucket, merge get_s3_resource into get_s3_bucket. but wait
    #  until after any restructuring as class.
    s3_resource = get_s3_resource(boto_session)
    s3_bucket = get_s3_bucket(s3_resource, config)

    # get list of files in relevant dir of bucket
    # by converting to list we should trigger just one call to S3 API, unlike iterating over collection
    # see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/collections.html#when-collections-make-requests
    objs = list(
        s3_bucket.objects.filter(
            Prefix=config['AWS']['s3_trips_prefix']).all())

    # use a pandas.DataFrame for convenient storage of metadata about the trip data csv files

    # collect metadata as (vertical) list of (horizontal) lists (rows), then construct DataFrame, for efficiency
    # (see "Notes" on
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html#pandas-dataframe-append)
    # parse filename into year, month, and trip type
    # unfortunately, the DataFrame constructor only accepts a singleton dtype argument and the default inference doesn't
    # preserve smaller numpy data types in the input. any improvement in the efficiency of the filtration and sorting
    # below probably not worth the cost of using astype() to force the series to convert after creation given relatively
    # small number of rows.
    trip_files = [parse_key(obj.key) for obj in objs]
    trip_files = DataFrame(trip_files,
                           columns=[col[0] for col in TRIPFILE_METADATA_COLS])

    # exclude undesired trip files now, using labeled columns, rather than the less convenient file names
    # for now, limit to those with usable lat and long columns: green and yellow, through the first half of 2016
    undesired_indices = trip_files[
        ~trip_files['Type'].isin(['green', 'yellow']) |
        (trip_files['Year'] > 2016) | ((trip_files['Year'] == 2016) &
                                       (trip_files['Month'] > 6))].index
    trip_files.drop(undesired_indices, axis=0, inplace=True)

    # assuming pandas isn't optimized enough under the hood that sorting first would speed the lookups involved in the
    # drops, more efficient to reduce dataset size before sorting
    trip_files.sort_values(['Year', 'Month'],
                           axis=0,
                           ascending=True,
                           inplace=True)

    if TESTING and TESTING_CSV_LIMIT is not None:
        trip_files = trip_files.head(min(len(trip_files), TESTING_CSV_LIMIT))

    # separate trip_file entries into separate pandas DataFrames for each time period (year, for now) and store them
    # chronologically in time_period_tables list
    # note that groupby guarantees the sort order done above will be preserved within each group — see
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html#pandas-dataframe-groupby
    year_groups = trip_files.groupby(['Year'])
    time_period_tables = [
        year_groups.get_group(year) for year in year_groups.groups
    ]

    # simulate the time periods in sequence
    for time_period_table in time_period_tables:
        simulate_time_period(time_period_table, config)
Example No. 28
#!/usr/bin/env python3
import argparse
from pprint import pprint

from common import load_config
from common import encode
from common import send_msg


def opt_parser():
    parser = argparse.ArgumentParser(
        description='Network reconfiguration node')
    parser.add_argument('--net_config',
                        default='config/sample_graph3.json',
                        type=str)
    return parser


if __name__ == '__main__':
    parser = opt_parser()
    opt = parser.parse_args()

    config = load_config(opt.net_config)
    pprint(config)

    start_msg = encode(dict(type="start"))
    for node in config.values():
        send_msg(start_msg, host=node['host'], port=node['port'])
        print(node)
Example No. 29
import os
import sys
import argparse

base_dir = os.path.dirname(os.path.abspath(__file__))
script_dir = os.path.join(base_dir, 'Common')

sys.path.insert(0, script_dir)

import common

from polyglotdb.client.client import PGDBClient

token = common.load_token()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus_name', help='Name of the corpus')

    args = parser.parse_args()
    corpus_name = args.corpus_name
    directories = [
        x for x in os.listdir(base_dir) if os.path.isdir(x) and x != 'Common'
    ]

    if args.corpus_name not in directories:
        print(
            'The corpus {0} does not have a directory (available: {1}).  Please make it with a {0}.yaml file inside.'
            .format(args.corpus_name, ', '.join(directories)))
        sys.exit(1)
    corpus_conf = common.load_config(corpus_name)
    print('Processing...')
    client = PGDBClient('http://localhost:{}'.format(8080), token=token)
    client.delete_database(corpus_name)
Example No. 30
import os
import re
import glob
from openpyxl import Workbook

from formula.parser import FormulaParser
from settings import BASE_DIR
from common import load_config

CONFIG = load_config()
WORK_ORDERS_PATH = os.path.join(BASE_DIR, CONFIG.get('default', 'formula_dir'))


def extract_viscosity():
    """提取粘度数据"""
    all_formula_files = glob.glob(f"{WORK_ORDERS_PATH}/**/*.xlsx",
                                  recursive=True)

    result = []
    for filepath in all_formula_files:
        parser = FormulaParser(filepath)
        formulas = parser.parse()
        for formula in formulas:
            after_adding_requirement = formula['metas'][
                'after_adding_requirement']
            if isinstance(after_adding_requirement, list):
                for requirement in after_adding_requirement:
                    if requirement.find("粘度要求") >= 0:
                        result.append(
                            dict(name=formula['name'], viscosity=requirement))
            elif after_adding_requirement and isinstance(
Example No. 31
                self.signal_hup = 0

            ts_msg = TsMessage(_ts, _msg)
            ts_msg.print_raw()

            self.msgs_curr_total = self.msgs_curr_total + 1


def handler_atexit():
    log("Terminated")


if __name__ == "__main__":
    atexit.register(handler_atexit)

    log("Running: Pid {:5d}".format(os.getpid()))

    params = {}
    load_config(params, "config/config.txt")

    host = params['addr']
    port = params['port']
    type = params['type']

    print(params['vers'], params['name'])

    client = MyClient(_host=host, _port=port, _datatype=type)
    client.run()

    sys.exit(0)
Example No. 32
    def command(cls, config_ini, dataset_names, options):
        common.load_config(config_ini)
        common.register_translator()

        from pylons import config
        apikey = config['dgu.merge_datasets.apikey']
        ckan = ckanapi.RemoteCKAN('https://data.gov.uk', apikey=apikey)
        #ckan = ckanapi.LocalCKAN()

        if options.publisher:
            org_name = common.name_stripped_of_url(options.publisher)
            if options.search:
                results = ckan.action.package_search(q=options.search,
                                                     fq='publisher:%s' %
                                                     org_name,
                                                     rows=100,
                                                     escape_q=False)
                dataset_names.extend(
                    [dataset['name'] for dataset in results['results']])
            else:
                org = ckan.action.organization_show(id=org_name,
                                                    include_datasets=True)
                dataset_names.extend([d['name'] for d in org['packages']])

        datasets = []
        datasets_by_name = {}

        def get_extra(dataset, key):
            for extra in dataset['extras']:
                if extra['key'] == key:
                    return extra['value']

        for dataset_name in dataset_names:
            print 'Dataset: %s' % dataset_name
        for dataset_name in dataset_names:
            # strip off the url part of the dataset name, if there is one
            dataset_name = common.name_stripped_of_url(dataset_name)
            dataset = ckan.action.package_show(id=dataset_name)
            harvest_source_ref = get_extra(dataset, 'harvest_source_reference')
            if harvest_source_ref:
                print '** Discarding dataset %s due to harvest source: %s **' \
                    % (dataset_name, harvest_source_ref)
                continue
            datasets.append(dataset)
            datasets_by_name[dataset['name']] = dataset
        datasets.sort(key=lambda x: x['metadata_modified'])

        # aggregate resources
        def resource_identity(res_dict, dataset_name):
            return (res_dict.get('date'), res_dict['url'],
                    res_dict.get('title') or res_dict['description'],
                    res_dict.get('format'), dataset_name)

        combined_resources = {}  # identity
        res_stats = Stats()
        for dataset in datasets:
            for resource in dataset['resources']:
                identity = resource_identity(resource, dataset['name'])
                resource['dataset_name'] = dataset['name']
                if identity in combined_resources:
                    print res_stats.add(
                        'Discarding duplicate', '\n%s duplicate of \n%s' %
                        (resource, combined_resources[identity]))
                else:
                    combined_resources[identity] = resource
        resources = combined_resources.values()

        # find dates for resources
        if options.frequency:
            url_munge_re = re.compile('(%20|-|_|\.)')

            def fields_to_hunt_for_date(res):
                date = res.get('date')
                if date:
                    yield 'date', date
                title = res.get('title')
                if title:
                    yield 'title', title
                yield 'description', res['description']
                yield 'url', url_munge_re.sub(' ', res['url'])
                if not options.update:
                    dataset = datasets_by_name[res['dataset_name']]
                    yield 'dataset-title', dataset['title']
                    yield 'dataset-notes', dataset['notes']

            ensure_regexes_are_initialized()
            global regexes
            for resource in resources:
                for field_name, field_value in fields_to_hunt_for_date(
                        resource):
                    if options.frequency in ('monthly', 'quarterly',
                                             'twice annually'):
                        month, year = hunt_for_month_and_year(field_value)
                        if year and month:
                            resource['date'] = '%02d/%s' % (month, year)
                            res_stats.add(
                                'Found date in %s' % field_name,
                                '%s %r' % (resource['date'], resource))
                            if resource.get(
                                    'resource_type') == 'documentation':
                                resource['resource_type'] = 'file'
                                res_stats.add('Converted additional resource',
                                              resource)
                            break
                    elif options.frequency == 'annually':
                        year = regexes['year'].search(field_value)
                        if year:
                            resource['date'] = year.groups()[0]
                            res_stats.add(
                                'Found date in %s' % field_name,
                                '%s %r' % (resource['date'], resource))
                            if resource.get(
                                    'resource_type') == 'documentation':
                                resource['resource_type'] = 'file'
                                res_stats.add('Converted additional resource',
                                              resource)
                            break
                else:
                    if resource.get('resource_type') == 'documentation':
                        print res_stats.add(
                            'Could not find date but it\'s Additional Resource',
                            resource)
                        continue
                    print res_stats.add('Could not find date', resource)
                    continue

            print 'Resources: \n', res_stats

            resources_without_date = [
                res for res in resources if not res.get('date')
                and res.get('resource_type') != 'documentation'
            ]
            for i, res in enumerate(resources_without_date):
                print 'Resources without dates %s/%s' % (
                    i + 1, len(resources_without_date))
                for field_name, field_value in fields_to_hunt_for_date(res):
                    print '  %s: %s' % (
                        field_name, field_value.encode('latin-1', 'ignore'))
                print 'https://data.gov.uk/dataset/%s/resource/%s' % (
                    res['dataset_name'], res['id'])
                date_format = {
                    'annually': 'YYYY',
                    'monthly': 'MM/YYYY',
                    'twice annually': 'MM/YYYY',
                    'quarterly': 'MM/YYYY'
                }
                input_ = raw_input(
                    'Date (%s) or DOCS to make it an Additional Resource: ' %
                    date_format[options.frequency])
                if input_.strip().lower() == 'docs':
                    res['date'] = ''
                    res['resource_type'] = 'documentation'
                else:
                    res['date'] = input_

            resources.sort(key=lambda x: x.get('date', '').split('/')[::-1])

        # Ensure there is not a mixture of resources with and without a date
        have_dates = None
        for res in resources:
            if res.get('resource_type') == 'documentation':
                continue
            if have_dates is None:
                have_dates = bool(res.get('date'))
            else:
                has_date = bool(res.get('date'))
                if has_date != have_dates:
                    print [res.get('date') for res in resources]
                    print 'Cannot mix resources with dates and others without!'
                    import pdb
                    pdb.set_trace()

        # Remove 'dataset_name' and other fields from resources
        ignore_res_fields = set(('dataset_name', 'created', 'position',
                                 'revision_id', 'id', 'tracking_summary'))
        for res in resources:
            for field in ignore_res_fields & set(res.keys()):
                del res[field]

        # Merge dataset fields
        def get_all_fields_and_values(datasets):
            ignore_fields = set((
                'id',
                'resources',
                'last_major_modification',
                'data_dict',
                'revision_timestamp',
                'num_tags',
                'metadata_created',
                'metadata_modified',
                'odi_certificate',
                'extras',  # they are at top level already
                'timeseries_resources',
                'individual_resources',
                'additional_resources',
                'revision_id',
                'organization',
                'tracking_summary',
                'num_resources',
                'license_title',
                'author',
                'author_email',
                'maintainer',
                'maintainer_email',
                'temporal_granularity',
                'geographic_granularity',
                'state',
                'isopen',
                'url',
                'date_update_future',
                'date_updated',
                'date_released',
                'precision',
                'taxonomy_url',
                'temporal_coverage-from',
                'temporal_coverage-to',
                'published_via',
                'creator_user_id',
            ))
            first_fields = [
                'title', 'name', 'notes', 'theme-primary', 'theme-secondary'
            ]
            all_field_values = defaultdict(list)
            for dataset in datasets:
                for field in dataset:
                    if field not in ignore_fields and dataset[field]:
                        all_field_values[field].append(dataset[field])
            for field in first_fields:
                yield field, all_field_values.get(field, [])
            for field in all_field_values:
                if field not in first_fields:
                    yield field, all_field_values[field]

        spend_data_defaults = {
            'geographic_coverage': None,
            'theme-primary': 'Government Spending',
            'theme-secondary': None,
            'update_frequency': 'monthly',
        }
        combined_dataset = {'resources': resources}
        all_fields_and_values = get_all_fields_and_values(datasets)
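        # Walk every field across the source datasets: identical values are
        # kept automatically, otherwise the user is prompted to pick or type one.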
        for field, values in all_fields_and_values:
            if field == 'notes':
                values = [value.strip() for value in values]
            if field == 'tags':
                # just merge them up-front and
                # don't offer the user any choice
                tags_by_name = {}
                for dataset_tags in values:
                    for tag in dataset_tags:
                        if tag['name'] not in tags_by_name:
                            tags_by_name[tag['name']] = tag
                values = [tags_by_name.values()]
            if field in ('codelist', 'schema'):
                # just merge them up-front
                # And convert the dict into just an id string
                ids = set()
                for dataset_values in values:
                    for value_dict in dataset_values:
                        ids.add(value_dict['id'])
                values = [list(ids)]
            print '\n%s:' % field
            pprint(list(enumerate(values)))
            if options.spend and field in spend_data_defaults:
                value = spend_data_defaults[field]
                print 'Spend data defaults to: %s' % value
                values = [value] if value is not None else None
            # don't be case-sensitive for boolean fields
            if field == 'core-dataset':
                values = [v.lower() for v in values]
            try:
                values_identical = len(set(values)) == 1
            except TypeError:
                if values and len(values):
                    val1 = values[0]
                    for val in values[1:]:
                        if val != val1:
                            values_identical = False
                            break
                    else:
                        values_identical = True
            if (not values) or (not len(values)):
                pass
            elif values_identical:
                value = values[0]
            elif field == 'name':
                while True:
                    from ckan.lib.munge import munge_title_to_name
                    munged_title = munge_title_to_name(
                        combined_dataset['title'])
                    print munge_title_to_name(
                        datasets[0]['organization']['title'])
                    value = raw_input('Type new value (%s): ' % (munged_title))
                    if not value:
                        value = munged_title
                    if len(value) < 3:
                        print 'Too short'
                        continue
                    if value in values:
                        print 'That name is taken'
                        continue
                    existing = ckan.action.package_autocomplete(q=value)
                    if value in existing:
                        print 'That name is taken on CKAN'
                        continue
                    break
            else:
                while True:
                    response = raw_input(
                        '%s: value (number) or type new one: ' % field)
                    try:
                        value_index = int(response)
                        value = values[value_index]
                        print value
                    except ValueError:
                        # fix pound signs if the user pasted from the repr'd version
                        response = re.sub(r'\\xa3', u'\xa3', response)
                        value = response
                    if not value and field in ('title', 'owner_org', 'notes',
                                               'license_id'):
                        print 'You must have a value for this field!'
                        continue
                    break
            if value:
                combined_dataset[field] = value

        # Store
        print '\nMerged dataset:\n'
        pprint(combined_dataset)

        response = raw_input(
            'Press enter to write or pdb to edit in pdb first: ')
        if response == 'pdb':
            import pdb
            pdb.set_trace()
        try:
            if options.update:
                ckan.action.dataset_update(**combined_dataset)
            else:
                ckan.action.dataset_create(**combined_dataset)
        except Exception, e:
            print e
            import pdb
            pdb.set_trace()
Example No. 33
def test_init_missing_endpoint():
    """ Try to initialize the context with a nonexistant service name. """
    config = load_config()
    config['name'] = "failboat"
    config['sign_messages'] = True
    context = FedMsgContext(**config)
Example No. 34
    ap.add_argument("list", nargs="?")

    return vars(ap.parse_args())


def isgzip(f):
    bytes = f.read(2)
    f.seek(0)

    return bytes == b"\x1F\x8B"


if __name__ == "__main__":
    warnings = 0
    global config
    config = common.load_config()
    args = parse_args()

    if args["quiet"]:
        common.progress = lambda x, y: None
        common.progress_finish = lambda: None

    if not config["lists-sync"]:
        print("Please configure lists in $HOME/.satools before running %s." %
              sys.argv[0],
              file=sys.stderr)
        sys.exit(1)

    common.mkdirs(config["lists-base"])
    os.chdir(config["lists-base"])
Example No. 35
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--no-download', action='store_true')
    parser.add_argument('--no-remux', action='store_true')
    parser.add_argument('--no-upload', action='store_true')
    parser.add_argument('--no-notify', action='store_true')
    parser.add_argument('--no-delete', action='store_true')
    parser.add_argument('--force', action='store_true')
    parser.add_argument('--force-log-to-file', action='store_true')
    parser.add_argument('--override-channel-name')
    parser.add_argument('--override-video-name')
    parser.add_argument('video_id')

    args = parser.parse_args()
    config = load_config()

    log_filename = (LOGDIR / f'{args.video_id}-{os.getpid()}.log'
                    if not sys.stdout.isatty() or args.force_log_to_file
                    else None)
    # For subprocess
    log_file = log_filename.open('a') if log_filename else None
    if log_file:
        sys.stderr = sys.stdout = log_file

    setup_logging(filename=log_filename)

    log.info(f'Starting download for {args.video_id}')

    pid_exists, active_downloaders = check_pid(args.video_id)

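    # Refuse to run if another downloader for this video is still alive
    # (unless --force); otherwise record our PID in the shared state.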
    if pid_exists and not args.force:
        raise ValueError('Another downloader is still alive, exiting')
    else:
        active_downloaders[args.video_id] = os.getpid()
        with open_state() as state:
            state['active_downloaders'] = active_downloaders

    if args.override_channel_name and args.override_video_name:
        log.info(
            'Using overridden channel and video name, setting is_upcoming to false'
        )
        channel_name = args.override_channel_name
        video_name = args.override_video_name
        # There's no reason to use these overrides for an upcoming video
        is_upcoming = False
    else:
        player_response = get_video_info(args.video_id)
        if 'videoDetails' not in player_response:
            log.error(f'{args.video_id} has no details, cannot proceed '
                      '(playability: {}, {})'.format(
                          player_response["playabilityStatus"]["status"],
                          player_response["playabilityStatus"]["reason"],
                      ))
            sys.exit(1)
        else:
            channel_name = player_response['videoDetails']['author']
            video_name = player_response['videoDetails']['title']
            is_upcoming = player_response['videoDetails'].get(
                'isUpcoming', False)

    log.info(f'Channel: {channel_name}')
    log.info(f'Title: {video_name}')
    log.info(f'Upcoming: {is_upcoming}')

    if is_upcoming:
        wait(player_response, config)

    filename_base = sanitize_filename(video_name)
    log.info(f'Filename base: {filename_base}')

    # Copy youtube-dl's naming scheme
    filepath_streamlink = WORKDIR / f'{filename_base}-{args.video_id}.ts'

    # TODO: If file already exists, rename it and concatenate it later?

    # XXX: youtube-dl used to be less reliable than streamlink for downloading
    # streams - that may no longer be the case.

    # XXX: Invoke this in a less hacky manner
    # The reason for doing this is that I wanted to use streamlink
    # inside the venv but in a separate process,
    # without hardcoding the path of the venv.
    streamlink_args = [
        '--force',  # Overwrite any existing file
        '--hls-timeout',
        '60',
        # XXX: This doesn't work right now!
        # See https://github.com/streamlink/streamlink/issues/2936
        '--hls-live-restart',
        '--retry-streams',
        '10',
        '--retry-max',
        '10',
        '-o',
        str(filepath_streamlink),
        f'https://www.youtube.com/watch?v={args.video_id}',
        'best',
    ]

    if not args.no_download:
        log.info(f'Starting streamlink with args: {streamlink_args}')
        fork_return = os.fork()
        if fork_return == 0:
            sys.argv = streamlink_args
            streamlink_main()
        else:
            os.wait()
    else:
        log.info('Skipping download')

    filename_output = f'{filename_base}-{args.video_id}.mp4'
    filepath_output = WORKDIR / filename_output
    ffmpeg_args = (
        'ffmpeg',
        '-y',
        '-i',
        filepath_streamlink,
        '-c',
        'copy',
        '-movflags',
        'faststart',
        '-metadata',
        f'title={video_name}',
        '-metadata',
        f'artist={channel_name}',
        '-metadata',
        f'comment=https://www.youtube.com/watch?v={args.video_id}',
        filepath_output,
    )
    if not args.no_remux:
        log.info('Remuxing to mp4')
        subprocess.run(ffmpeg_args, stdout=log_file)
    else:
        log.info('Skipping remux')

    # Upload
    if not args.no_upload:
        link_url, thumbnail = upload(
            sanitize_filename(channel_name),
            # This argument duplication is kind of silly...
            filename_output,
            filepath_output,
        )

        # We only have a link and thumbnail when an upload happened (getting
        # them otherwise would take a bunch more effort), so only notify here.
        if not args.no_notify:
            notify(
                channel_name,
                video_name,
                link_url,
                thumbnail,
            )
        else:
            log.info('Skipping notify')
    else:
        log.info('Skipping upload')

    if not args.no_delete:
        log.info('Deleting work files')
        filepath_streamlink.unlink()
        filepath_output.unlink()

        log.info('Cleaning up state')
        with open_state() as state:
            active_downloaders = state.get('active_downloaders', {})
        active_downloaders.pop(args.video_id, None)
    else:
        log.info('Skipping cleanup')

    log.info('All done!')
Example No. 36
import os
from os import environ

from common import load_config

load_config(environ.get('PYTHONPATH', ''))


class Config:
    SECRET_KEY = os.urandom(24)

    SESSION_COOKIE_NAME = environ.get('SESSION_COOKIE_NAME')
    SESSION_COOKIE_SECURE = True

    SQLALCHEMY_DATABASE_URI = environ.get('SQLALCHEMY_DATABASE_URI')
    SQLALCHEMY_ECHO = False
    SQLALCHEMY_TRACK_MODIFICATIONS = False


class ProdConfig(Config):
    FLASK_ENV = 'production'
    DEBUG = False
    TESTING = False


class DevConfig(Config):
    FLASK_ENV = 'development'
    DEBUG = True
    TESTING = True
Example No. 37
def fixup_versions(config: dict = None):
    if config is None:
        config = common.load_config()

    import subprocess
    import glob
    import mmap
    import sys

    root = common.root_dir()
    script_dir = pathlib.Path(__file__).absolute().parent
    res_hacker = glob.glob(f"{root}/packages/**/ResourceHacker.exe",
                           recursive=True)[0]

    data_dir = common.unity_data_dir(config["unityBuildDir"])
    if data_dir is None:
        print("No unity build dir found", file=sys.stderr)
        return

    plugins_dir = data_dir / "Managed"
    with open(script_dir / "version.rc.in", "r", newline="") as file:
        res = file.read()
    version_rc = script_dir / "version.rc"
    version_res = version_rc.with_suffix(".res")

    version = "1.0.0.0"
    for plugin_dll in config["invalidFileVersion"]:
        plugin = plugins_dir / plugin_dll

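        # Render the version resource template for this plugin, compile it with
        # ResourceHacker, then patch the compiled .res into the DLL.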
        with open(version_rc, "w") as file:
            file.write(res.format(name=plugin_dll, version=version))

        subprocess.check_call([
            res_hacker,
            "-open",
            version_rc,
            "-save",
            version_res,
            "-action",
            "compile",
            "-log",
            "CON",
        ])

        # damn ResourceHacker puts invalid length bytes as far as .NET is
        # concerned; doesn't seem to affect any other tools though
        fileversion_bytes = bytearray.fromhex(
            "46 00 69 00 6c 00 65 00 56 00 65 00 72 00 73 00 69 00 6F 00 6E 00"
        )
        with open(version_res, "r+b") as f:
            with mmap.mmap(f.fileno(), 0) as mm:
                index = -1
                while True:
                    index = mm.find(fileversion_bytes, index + 1)
                    if index == -1:
                        break
                    # +1 for the null character I guess
                    len_str = str(hex(len(version) + 1))[2:]
                    if len(len_str) == 1:
                        len_str = "0" + len_str
                    assert len(len_str) == 2
                    mm[index - 4] = bytearray.fromhex(len_str)[0]

        subprocess.check_call([
            res_hacker,
            "-open",
            plugin,
            "-save",
            plugin,
            "-action",
            "addskip",
            "-res",
            version_res,
            "-log",
            "CON",
        ])

        print(f"{plugin}: fixed version resource")
Example No. 38
def main():
    global conf
    conf = load_config()
    service.run(host=conf['flask_host'], port=conf['flask_port'])
Example No. 39
 def __init__(self):
     self.params = {}
     load_config(self.params, "config/config.txt")
     return
Example No. 40
def get_cache_status(archival):
    if not archival.cache_filepath:
        return 'Not cached'
    if os.path.exists(archival.cache_filepath):
        return 'Cached'
    return 'Cache missing on disk!'

if __name__ == '__main__':
    usage = __doc__ + """
usage:

%prog [-w] <ckan.ini>
"""
    parser = OptionParser(usage=usage)
    parser.add_option("-w", "--write",
                      action="store_true", dest="write",
                      help="write the theme to the datasets")
    parser.add_option('-d', '--dataset', dest='dataset')
    parser.add_option('-r', '--resource', dest='resource')
    parser.add_option('-o', '--organization', dest='organization')
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error('Wrong number of arguments (%i)' % len(args))
    config_filepath = args[0]
    print 'Loading CKAN config...'
    common.load_config(config_filepath)
    common.register_translator()
    print 'Done'
    fix_links(options)
Example No. 41
if __name__ == '__main__':
    USAGE = '''Daily script for government
    Usage: python %s <config.ini> [task]

    Where:
       [task] - task to run (optional), picked from:
                %s
                or run multiple by separating by a comma.
    ''' % (sys.argv[0], ','.join(TASKS_TO_RUN))

    if set(sys.argv) & set(('--help', '-h')):
        print USAGE
        sys.exit(1)
    if len(sys.argv) < 2:
        err = 'Error: Please specify config file.'
        print USAGE, err
        logging.error('%s' % err)
        sys.exit(1)
    config_file = sys.argv[1]
    config_ini_filepath = os.path.abspath(config_file)

    if len(sys.argv) == 3:
        TASKS_TO_RUN = sys.argv[2].split(',')

    load_config(config_ini_filepath)
    register_translator()
    logging.config.fileConfig(config_ini_filepath)

    command(config_file)
Example No. 42
        if admin_count:
            print "    -> ",
            print ', '.join(
                u.name
                for u in group.members_of_type(model.User, 'admin').all())

        editor_count = group.members_of_type(model.User, 'editor').count()
        print "Editors: {uc}".format(uc=editor_count)
        if editor_count:
            print "    -> ",
            print ', '.join(
                u.name
                for u in group.members_of_type(model.User, 'editor').all())

        print "Dataset count: {dc}".format(
            dc=group.members_of_type(model.Package).count())


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('config', help='CKAN config .ini filepath')
    parser.add_argument('old_name', metavar='old_name')
    parser.add_argument('new_name', metavar='new-name')
    parser.add_argument('-t', '--title', help='Title to set')
    args = parser.parse_args()

    common.load_config(args.config)
    common.register_translator()

    PublisherRenamer().rename(args.old_name, args.new_name, args.title)
Example #43
def start(demo=False):
    if demo:
        # Load demo options; this bypasses any config file.
        cookie_secret = common.hmacstr(common.randomstr(), common.randomstr())
        args = [
            sys.argv[0], "--debug", "--host=0.0.0.0", "--port=8080",
            "--base_url=/ssweb", "--service_name=shadowsocks",
            "--cookie_secret=" + cookie_secret, "--logging=debug"
        ]
        options.parse_command_line(args)
    else:
        # Pre-parse the command line options. They will be overwritten by the
        # options loaded from the config file below, which has not been loaded yet.
        options.parse_command_line()

        if options.config is not None:
            # load options from specified config file
            if not os.path.isfile(options.config):
                err_("Can't find config file '%s'." % options.config)
                exit(1)
            else:
                config = common.load_config(options.config)
                if config is not None:
                    info_("Load config from file '%s'." % options.config)
                    args = [sys.argv[0]]
                    for item in config:
                        args += ["--%s=%s" % (item, config[item])]
                    try:
                        options.parse_command_line(args)
                    except tornado.options.Error:
                        err_("Error on config file option.")
                        sys.exit(1)
        else:
            # load options from config file, if the file exists.
            config_file = common.find_config_file()
            if config_file is not None:
                config = common.load_config(config_file)
                if config is not None:
                    info_("Load config from file '%s'." % config_file)
                    args = [sys.argv[0]]
                    for item in config:
                        args += ["--%s=%s" % (item, config[item])]
                    try:
                        options.parse_command_line(args)
                    except tornado.options.Error:
                        err_("Error on config file option.")
                        sys.exit(1)

        # load options from command line
        try:
            options.parse_command_line()
        except tornado.options.Error:
            err_("Error on command line option.")
            sys.exit(1)
    debug_("options: %s" % json.dumps(options.as_dict(), sort_keys=True))
    logging.debug("options: %s" %
                  json.dumps(options.as_dict(), sort_keys=True))

    # load shadowsocks configuration
    ss_config_filename = common.find_shadowsocks_config_file()
    if ss_config_filename is None:
        err_("Can't find any shadowsocks config file. Are you sure there "
             "installed shadowsocks already?")
        exit(1)
    config = common.load_shadowsocks_config(ss_config_filename)
    info_("Loading shadowsocks config from file '%s'." % ss_config_filename)
    start_tornado(config, ss_config_filename)
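In the non-demo branches above, every key of the loaded config is re-fed to tornado's option parser as a --key=value argument. An illustrative config mapping that would round-trip through that loop; the keys are inferred from the demo arguments and are not taken from a real config file:

import sys

# Each key must correspond to an option the application defines elsewhere.
config = {
    "host": "0.0.0.0",
    "port": 8080,
    "base_url": "/ssweb",
    "service_name": "shadowsocks",
    "logging": "info",
}
args = [sys.argv[0]] + ["--%s=%s" % (k, v) for k, v in config.items()]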


def usage():
    print """
Imports publishers from the specified CSV file.
Usage:

  python publisher_categories.py <CKAN config ini filepath> export pub_cats.csv
    - produces a list of publishers and their categories

  python publisher_categories.py <CKAN config ini filepath> import pub_cats.csv
    - import an amended list of publishers and their categories
    """

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print 'Wrong number of arguments %i' % len(sys.argv)
        usage()
        sys.exit(0)
    cmd, config_ini, action, filepath = sys.argv
    common.load_config(config_ini)
    PublisherCategories.setup_logging(config_ini)
    common.register_translator()
    if action == 'export':
        PublisherCategories.export(filepath)
    elif action == 'import':
        PublisherCategories.import_(filepath)
    else:
        raise NotImplementedError
Example #45
def transfer(read_from, save_to):
    click.echo('%s --> %s' % (read_from, save_to))
    if read_from not in OPTIONS or save_to not in OPTIONS:
        print 'Should be %s or %s' % (LOCAL, FIREBASE)
        sys.exit(-1)
    if read_from == save_to:
        print 'Saving data to where it is from does not make sense.'
        sys.exit(-2)

    click.echo('This will OVERWRITE data in "%s". Are you sure? [y/N]' %
               save_to)
    confirm = sys.stdin.readline()
    if confirm.strip() != 'y':
        print 'byebye~'
        return

    common.READ_FROM = common.LOCAL if read_from == LOCAL else common.FIREBASE
    common.SAVE_TO = (common.LOCAL,)\
        if save_to == LOCAL else (common.FIREBASE,)

    print 'Transferring catalog...'
    catalog = common.load_catalog()
    common.save_catalog(catalog)

    print 'Transferring categories...'
    catalog = common.load_catalog()
    categories = common.load_categories()
    common.save_categories(categories)

    print 'Transferring filter results...'
    f_results = common.load_filter_results()
    common.save_filter_results(f_results)

    print 'Transferring indicator results...'
    i_results = common.load_indicator_results()
    common.save_indicator_results(i_results)

    print 'Transferring config...'
    config = common.load_config()
    common.save_config(config)

    todo = []
    for stocks in catalog.values():
        todo.extend(stocks)
    total = len(todo)
    print 'Transferring stocks...'
    widgets = [
        FormatLabel(
            'Processed: %(value)d / {0} (in: %(elapsed)s)'.format(total))
    ]
    pbar = ProgressBar(widgets=widgets, maxval=total)
    count = 0
    pbar.start()
    for s in todo:
        data = common.load_stock(s)
        common.save_stock(s, data)
        pbar.update(count)
        count += 1
    pbar.finish()

    print 'Transferring state...'
    catalog = common.load_catalog()
    state = common.load_state()
    common.save_state(state)
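transfer() above is written against click, but the command decorator is not part of the excerpt. A sketch of how it might be wired up; the wrapper name and argument handling are assumptions:

import click

@click.command()
@click.argument('read_from')
@click.argument('save_to')
def transfer_cmd(read_from, save_to):
    # Thin wrapper delegating to the transfer() function above.
    transfer(read_from, save_to)

if __name__ == '__main__':
    transfer_cmd()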
#!/usr/bin/python
import sys
import logging
import time

import common
import message_consumer

logger = logging.getLogger(__name__)

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.getLogger("pika").setLevel(logging.ERROR)
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    common.load_config()

    while True:
        try:
            message_consumer.initiate_consumer()
        except Exception as e:
            logger.error(e)
            time.sleep(60)
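message_consumer.initiate_consumer() is not shown; given the pika logger configured above, it presumably opens a connection and consumes until the broker drops it, which is what the retry loop guards against. A sketch under those assumptions (the queue name, host and blocking-connection style are all guesses):

import pika

def initiate_consumer():
    connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
    channel = connection.channel()
    channel.queue_declare(queue='events', durable=True)

    def on_message(ch, method, properties, body):
        # Process the message here, then acknowledge it.
        ch.basic_ack(delivery_tag=method.delivery_tag)

    channel.basic_consume(queue='events', on_message_callback=on_message)
    channel.start_consuming()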
Example #47
    def command(cls, config_ini, options):
        common.load_config(config_ini)
        common.register_translator()

        from ckan import model
        from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
                                           SECONDARY_THEMES)
        rev = model.repo.new_revision()
        rev.author = 'script-fix_themes.py'

        datasets = common.get_datasets(state='active',
                                       dataset_name=options.dataset,
                                       organization_ref=options.organization)

        def fix_theme(theme_str):
            '''Returns (fixed_theme_str, outcome)'''
            if not theme_str:
                return '', 'Blank'
            elif theme_str == 'null':
                return '', '"null"->""'
            elif theme_str in THEMES:
                return theme_str, 'Ok'
            else:
                fixed_theme = THEME_MAP.get(theme_str)
                if fixed_theme is None:
                    return theme_str, 'Unknown theme %s - recategorizing' % theme_str
                else:
                    assert (fixed_theme != theme_str)
                    return fixed_theme, 'Changed to long form'

        def recategorize(pkg):
            themes = categorize_package(pkg, stats_recategorize)
            print 'Recategorize: %s' % themes
            if themes:
                pkg.extras[PRIMARY_THEME] = themes[0]
            elif PRIMARY_THEME in pkg.extras:
                pkg.extras[PRIMARY_THEME] = ''
            if len(themes) > 1:
                pkg.extras[SECONDARY_THEMES] = '["%s"]' % themes[1]
            elif SECONDARY_THEMES in pkg.extras:
                pkg.extras[SECONDARY_THEMES] = '[]'

        for package in datasets:
            if PRIMARY_THEME in package.extras:
                primary = package.extras.get(PRIMARY_THEME)
                new_primary, outcome = fix_theme(primary)
                if new_primary != primary:
                    package.extras[PRIMARY_THEME] = new_primary
                output = stats_primary.add(outcome, package.name)
                if outcome != 'Ok':
                    print output
                if outcome.startswith('Unknown theme'):
                    recategorize(package)
                    continue
            else:
                stats_primary.add('No theme', package.name)

            if SECONDARY_THEMES in package.extras:
                secondary = package.extras.get(SECONDARY_THEMES)
                try:
                    secondary = json.loads(secondary)
                except ValueError:
                    if secondary.startswith('{') and secondary.endswith('}'):
                        # '{Crime}' -> 'Crime'
                        secondary = secondary[1:-1].strip('\"')
                        print stats_secondary.add('Tidied {}', package.name)
                    else:
                        print stats_secondary.add('Error decoding JSON',
                                                  package.name)

                if secondary == {}:
                    secondary = []

                new_secondary = []
                do_recategorize = False

                if not isinstance(secondary, list):
                    secondary = [secondary]
                for theme_str in secondary:
                    if not isinstance(theme_str, basestring):
                        print stats_secondary.add(
                            'Not a list of strings %s' % type(theme_str),
                            package.name)
                        continue
                    new_theme, outcome = fix_theme(theme_str)
                    if new_theme:
                        new_secondary.append(new_theme)
                    if outcome != 'Ok':
                        print stats_secondary.add(outcome, package.name)
                    if outcome.startswith('Unknown theme'):
                        do_recategorize = True
                if do_recategorize:
                    recategorize(package)
                    continue
                if json.dumps(new_secondary) != package.extras.get(
                        SECONDARY_THEMES):
                    stats_secondary.add('Fixed', package.name)
                    package.extras[SECONDARY_THEMES] = json.dumps(
                        new_secondary)
                else:
                    stats_secondary.add('Ok', package.name)
            else:
                stats_secondary.add('No theme', package.name)

            if 'themes-secondary' in package.extras:
                print stats_secondary.add(
                    'Old key removed: themes-secondary', '%s %s' %
                    (package.name, package.extras['themes-secondary']))
                del package.extras['themes-secondary']

        print "\nPrimary theme:"
        print stats_primary.report()
        print "\nSecondary theme:"
        print stats_secondary.report()
        print "\nRecategorizations:"
        print stats_recategorize.report()

        if options.write:
            print 'Writing'
            model.Session.commit()
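THEMES and THEME_MAP are module-level constants defined elsewhere in fix_themes.py. A sketch of the shape fix_theme() relies on; the values here are purely illustrative, not the real DGU theme list:

# Canonical (long-form) theme names accepted as-is by fix_theme().
THEMES = set(['Health', 'Environment', 'Crime & Justice'])

# Known short or legacy spellings mapped to their long form.
THEME_MAP = {
    'health': 'Health',
    'crime': 'Crime & Justice',
}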
Example #48
def main():
    config = common.load_config()

    update_assembly_info(config)
    generate_version_file(config)
    update_readme(config)
Example #49
        current_value = a * b / c + value_breakpoint.value_min
        return current_value

    def get_breakpoints(self, value_type, sensor_value):
        return Breakpoints.query.filter(
            Breakpoints.value_min <= sensor_value,
            Breakpoints.value_max >= sensor_value,
            Breakpoints.sensor_value_type_id == value_type).first()

    def save_area_data(self, record_aqi, values, sensor_id):
        v = {}
        for key in values:
            if not isinstance(key, str):
                v[key.type] = values[key]
            else:
                v[key] = values[key]

        area = AreaModel(aqi=record_aqi, sensor_id=sensor_id, **v)
        self.db.session.add(area)
        self.db.session.commit()


if __name__ == '__main__':
    load_config()
    app = create_app()
    app.app_context().push()

    calc = AQICalculator(db)
    result = calc.execute()
Example #50
def upload(channel_directory, filename, filepath):
    config = load_config()

    dbx = dropbox.Dropbox(config['dropbox_api_access_token'])

    upload_chunk_size = config['dropbox_chunk_size_mb'] * 1024 * 1024

    # Dropbox doesn't support characters defined outside the BMP. This includes most, but not all emoji.
    filename = re.sub(r'[^\u0000-\uffff]', '', filename)

    full_path = DROPBOX_ROOT / channel_directory / filename

    log.info(f'Full upload path is {full_path}')

    total_size = filepath.stat().st_size
    total_chunks = (total_size // upload_chunk_size) + 1

    log.info(f'Uploading in {total_chunks} chunks')

    log.info('Starting session')
    session = dbx.files_upload_session_start(b'')

    uploaded = 0
    hasher = DropboxContentHasher()

    for chunk_num in range(total_chunks):
        log.info(f'Uploading chunk {chunk_num}')
        is_last_chunk = chunk_num == total_chunks - 1
        cursor = dropbox.files.UploadSessionCursor(
            session_id=session.session_id,
            offset=uploaded,
        )

        upload_chunk(dbx, cursor, filepath, chunk_num, upload_chunk_size,
                     hasher, is_last_chunk)

        uploaded += total_size % upload_chunk_size if is_last_chunk else upload_chunk_size

    log.info('Finishing session')
    file_metadata = dbx.files_upload_session_finish(
        b'',
        dropbox.files.UploadSessionCursor(
            session_id=session.session_id,
            offset=uploaded,
        ),
        dropbox.files.CommitInfo(path=str(full_path), ),
    )

    local_hash = hasher.hexdigest()
    remote_hash = file_metadata.content_hash

    # TODO: Actually take some sort of action based on this,
    # especially considering that downloaders log to some file
    # no one will ever see. Just retry reuploading it maybe?

    assert local_hash == remote_hash, f'Local hash {local_hash} and remote hash {remote_hash} do not match'

    # Get the shared link and thumbnail
    # XXX: Maybe we should fetch the Youtube thumbnail way earlier?
    shared_link = dbx.sharing_create_shared_link(str(full_path))
    _, thumbnail_resp = dbx.files_get_thumbnail(
        str(full_path),
        format=dropbox.files.ThumbnailFormat.png,
        size=dropbox.files.ThumbnailSize.w1024h768,
    )

    # This is probably pretty brittle
    url = shared_link.url.replace("www.dropbox", "dl.dropboxusercontent")
    return (url, thumbnail_resp.content)
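upload_chunk() is not part of the excerpt. A sketch of what it likely does, appending one slice of the file to the open upload session and feeding the same bytes to the content hasher used for the integrity check; the parameter order follows the call site, everything else is an assumption:

def upload_chunk(dbx, cursor, filepath, chunk_num, chunk_size, hasher, is_last_chunk):
    # Read the chunk_num-th slice of the file.
    with open(filepath, 'rb') as f:
        f.seek(chunk_num * chunk_size)
        data = f.read(chunk_size)
    # Keep the local hash in step with what Dropbox will compute.
    hasher.update(data)
    # is_last_chunk is accepted to mirror the call site; the session is closed
    # by files_upload_session_finish() in the caller, so it is not used here.
    dbx.files_upload_session_append_v2(data, cursor)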
Example #51
def setUp(self):
    config = load_config()
    config['io_threads'] = 1
    self.ctx = FedMsgContext(**config)
Example #52
    def command(cls, config_ini, options, submissions_csv_filepath):

        # Incentive CSV. Columns:
        # applicationnumber, applicationdate, jobrole, laname, officerauthorised,
        # theme, responsedate, acceptancestatus, odicertificateurl, dguurl,
        # inventoryurl, localcodes, dataseturl, schemaurl, guidanceurl,
        # frequencyofpublishing, foinumberest, submissioncomplete, lastlaupdate,
        # techreviewstatus, lasttechupdate, adminreviewstatus, paymentamount,
        # closed, lastadminupdate, applicantnotes, administrationnotes,
        # technicalnotes, lastupdated
        with open(submissions_csv_filepath, 'rb') as f:
            csv = UnicodeCsvReader(f, encoding='iso-8859-1')
            header = csv.next()
            header = [col_name.strip().lower().replace(' ', '_') for col_name in header]
            Submission = namedtuple('Submission', header)
            submissions = [Submission(*row) for row in csv]

        if config_ini:
            # this is only for when running from the command-line
            #print 'Loading CKAN config...'
            common.load_config(config_ini)
            common.register_translator()
            #print '...done'

        from ckan import model
        from ckan.plugins import toolkit
        from ckanext.dgu.lib import helpers as dgu_helpers
        from ckanext.dgu.model.schema_codelist import Schema

        log = __import__('logging').getLogger(__name__)

        # Match the organizations in the submissions
        lga_orgs_by_dgu_org_name = {}
        accepted_submission_dgu_orgs = set()
        for submission in submissions:
            la_title = la_map.get(submission.laname, submission.laname)
            org = model.Session.query(model.Group) \
                       .filter_by(title=la_title) \
                       .first()
            assert org, 'Submission org title not found: %r' % la_title
            lga_orgs_by_dgu_org_name[org.name] = submission.laname
            if submission.acceptancestatus == 'Accepted':
                accepted_submission_dgu_orgs.add(org.name)

        stats = Stats()
        stats_incentive = Stats()
        results = []

        if options.write:
            rev = model.repo.new_revision()
            rev.author = 'script-%s.py' % __file__

        # Iterate over organizations
        if options.dataset:
            dataset = toolkit.get_action('package_show')(data_dict={'id': options.dataset})
            org_names = [dataset['organization']['name']]
        elif options.organization:
            org_names = [options.organization]
        elif options.incentive_only:
            org_names = sorted(accepted_submission_dgu_orgs)
        else:
            org_names = dgu_helpers.all_la_org_names()
        #print '%s organizations' % len(org_names)
        for org_name in org_names:
            org_title = model.Group.by_name(org_name).title
            lga_org = lga_orgs_by_dgu_org_name.get(org_name)

            # Iterate over the schemas
            if options.schema:
                schema = all_schemas_by_dgu_name[options.schema]
                if options.incentive_only and not schema.lga_name:
                    # not an incentive schema, so no results
                    schemas = []
                elif options.incentive_only:
                    schemas = [all_schemas_by_lga_name[submission.theme]
                               for submission in submissions
                               if submission.laname == lga_org
                               and submission.theme == schema.lga_name
                               and submission.acceptancestatus == 'Accepted']
                else:
                    schemas = [all_schemas_by_lga_name.get(
                               options.schema,
                               schema)]
            elif options.incentive_only:
                schemas = [all_schemas_by_lga_name[submission.theme]
                           for submission in submissions
                           if submission.laname == lga_org
                           and submission.acceptancestatus == 'Accepted']
            else:
                schemas = all_schemas
            #print '%s schemas' % len(schemas)
            for schema in schemas:

                # Find the relevant incentive submission
                if lga_org:
                    for submission in submissions:
                        if submission.laname == lga_org and \
                                submission.theme == schema.lga_name:
                            break
                    else:
                        submission = None
                else:
                    submission = None

                result = dict(
                    org_name=org_name,
                    org_title=org_title,
                    org_name_lga=submission.laname if submission else '',
                    schema_dgu_title=schema.dgu_schema_name,
                    schema_lga=schema.lga_name,
                    lga_application_number=submission.applicationnumber if submission else '',
                    lga_application_acceptance_status=submission.acceptancestatus if submission else '',
                    dataset_names=[],
                    dataset_titles=[],
                    dataset_schema_applied=[],
                    )

                stat_id = '%s %s' % (org_name, schema.lga_name)
                if submission:
                    stat_id += ' %s' % submission.applicationnumber

                def add_datasets_to_results(datasets, result):
                    for dataset in datasets:
                        if dataset['name'] not in result['dataset_names']:
                            result['dataset_names'].append(dataset['name'])
                            result['dataset_titles'].append(dataset['title'])
                            schema_applied = True if schema.dgu_schema_name in \
                                [s['title'] for s in dataset.get('schema', [])] \
                                else False
                            result['dataset_schema_applied'].append(schema_applied)
                            if not schema_applied and options.write:
                                pkg = model.Package.get(dataset['name'])
                                schema_obj = Schema.by_title(schema.dgu_schema_name)
                                assert schema_obj, schema.dgu_schema_name
                                try:
                                    schema_ids = json.loads(pkg.extras.get('schema') or '[]')
                                except ValueError:
                                    log.error('Not valid JSON in schema field: %s %r',
                                              dataset['name'], pkg.extras.get('schema'))
                                    schema_ids = []
                                schema_ids.append(schema_obj.id)
                                pkg.extras['schema'] = json.dumps(schema_ids)

                # Already a schema?
                data_dict = {'fq': 'publisher:%s ' % org_name +
                                   'schema_multi:"%s"' % schema.dgu_schema_name}
                datasets = toolkit.get_action('package_search')(data_dict=data_dict)
                if datasets['count'] > 0:
                    add_datasets_to_results(datasets['results'], result)
                    stats.add('OK - Dataset with schema',
                              stat_id + ' %s' % ';'.join(result['dataset_names']))
                    found_schema = True
                else:
                    found_schema = False

                # Submission specifies DGU dataset
                if submission and submission.dguurl:
                    match = re.match('http://data.gov.uk/dataset/(.*)', submission.dguurl)
                    if match:
                        dataset_name = dataset_name_original = match.groups()[0]
                        # some have trailing /
                        dataset_name = dataset_name.strip('/')
                        # hampshire have a hash appended
                        if '#' in dataset_name:
                            dataset_name = dataset_name.split('#')[0]
                        # poole have a resource name appended
                        if '/resource' in dataset_name:
                            dataset_name = dataset_name.split('/resource')[0]
                        # manual corrections
                        if dataset_name in dataset_name_corrections:
                            dataset_name = dataset_name_corrections[dataset_name]
                        dataset = model.Package.by_name(dataset_name)
                        # salford ones added a '1'
                        if not dataset:
                            dataset = model.Package.by_name(dataset_name + '1')
                            if dataset:
                                dataset_name += '1'

                        if dataset and dataset.state == 'active':
                            dataset_dict = toolkit.get_action('package_show')(data_dict={'id': dataset.id})
                            add_datasets_to_results([dataset_dict], result)
                            if dataset_name != dataset_name_original:
                                stats_incentive.add('OK - DGU Dataset listed and with corrections it checks out',
                                          stat_id + ' %s' % dataset_name)
                            else:
                                stats_incentive.add('OK - DGU Dataset listed and it checks out',
                                          stat_id + ' %s' % dataset_name)
                        elif dataset:
                            stats_incentive.add('ERROR - DGU Dataset listed BUT it is deleted!',
                                            '%s %s' % (stat_id, submission.dguurl))
                        else:
                            stats_incentive.add('ERROR - DGU Dataset listed BUT it is not found',
                                            '%s %s' % (stat_id, submission.dguurl))
                    else:
                        stats_incentive.add('ERROR - DGU Dataset listed BUT the URL is not the correct format',
                                        '%s %s' % (stat_id, submission.dguurl))

                # Submission mentions dataset on LA site - maybe it is in DGU already?
                elif submission and submission.dataseturl:
                    datasets = model.Session.query(model.Package) \
                                    .join(model.ResourceGroup) \
                                    .join(model.Resource) \
                                    .filter(model.Resource.url==submission.dataseturl) \
                                    .filter(model.Package.state=='active') \
                                    .filter(model.Resource.state=='active') \
                                    .all()
                    dataset_dicts = [
                        toolkit.get_action('package_show')(data_dict={'id': dataset.id})
                        for dataset in datasets]
                    add_datasets_to_results(dataset_dicts, result)
                    if len(datasets) > 1:
                        stats_incentive.add('No DGU Dataset, but Dataset URL matches multiple DGU datasets',
                                            '%s %s' % (stat_id, datasets[0].name))
                    elif len(datasets) == 0:
                        stats_incentive.add('No DGU Dataset and Dataset URL not found on DGU',
                                            stat_id)
                    else:
                        stats_incentive.add('No DGU Dataset, but Dataset URL matches DGU dataset',
                                            '%s %s' % (stat_id, datasets[0].name))

                # Search for datasets in the catalogue
                datasets = cls.find_dataset_for_schema(schema=schema, org_name=org_name)
                if datasets is None:
                    if not found_schema:
                        stats.add('Search revealed none', stat_id)
                elif len(datasets) > 1:
                    add_datasets_to_results(datasets, result)
                    if not found_schema:
                        stats.add('Found datasets (multiple) in search', '%s %r' % (stat_id, [d['name'] for d in datasets]))
                elif datasets:
                    add_datasets_to_results(datasets, result)
                    if not found_schema:
                        stats.add('Found dataset in search', '%s %s' % (stat_id, datasets[0]['name']))
                else:
                    if not found_schema:
                        stats.add('No dataset for submission', stat_id)

                results.append(result)

        rows_with_datasets_count = \
            len([result for result in results
                 if any(result['dataset_schema_applied'])])
        rows_with_datasets_or_candidate_datasets_count = \
            len([result for result in results
                 if result['dataset_schema_applied']])

        if options.print_:
            print '\n Incentive stats\n' + stats_incentive.report()
            print '\n Overall stats\n' + stats.report()

        if options.write:
            print 'Writing'
            model.Session.commit()

        return {'table': results,
                'rows_with_datasets_count': rows_with_datasets_count,
                'rows_with_datasets_or_candidate_datasets_count': rows_with_datasets_or_candidate_datasets_count}
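Stats() here, as in several of the other CKAN maintenance scripts on this page, is a small tallying helper. A minimal stand-in showing the interface these scripts rely on (add() returns a printable line, report() summarises each category); the real class lives in the dgu codebase and may differ:

from collections import defaultdict

class Stats(object):
    def __init__(self):
        self.categories = defaultdict(list)

    def add(self, category, item):
        # Record the item under the category and return a line suitable for printing.
        self.categories[category].append(item)
        return '%s: %s' % (category, item)

    def report(self):
        # One line per category with the number of items recorded against it.
        return '\n'.join('%s: %s' % (category, len(items))
                         for category, items in sorted(self.categories.items()))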
Example #53
def init():
    global config
    config = common.load_config()
Example #54
        finished = True
        start = time.time()

        for iso in get_isos():
            if iso.match() and fileset.tas(iso.name):
                if not os.path.exists(iso.name):
                    iso.download()

            # links are time-sensitive; if get_isos() is out of date we will
            # need to re-fetch
            if time.time() > start + 300:
                finished = False
                break

if __name__ == "__main__":
    config = common.load_config()
    args = parse_args()

    if args["list"]:
        for iso in get_isos():
            print("[%c] %s" % ([" ", "*"][iso.match()], iso.name))
        sys.exit(0)

    common.mkdirs(config["rhn-dumps-base"])
    os.chdir(config["rhn-dumps-base"])

    lock = common.Lock(".lock")

    threads = []
    for i in range(int(config["rhn-dumps-threads"])):
        t = threading.Thread(target=worker, name=i)
Example #55
if __name__ == '__main__':
    usage = """Tool to migrate QA data from TaskStatus to QA table

    usage: %prog [options] <ckan.ini>
    """
    parser = OptionParser(usage=usage)
    parser.add_option("-w", "--write",
                      action="store_true", dest="write",
                      help="write the changes")
    parser.add_option('-p', '--publisher', dest='publisher')
    parser.add_option('-d', '--dataset', dest='dataset')
    parser.add_option('-r', '--resource', dest='resource')
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error('Wrong number of arguments (%i)' % len(args))
    config_ini = args[0]
    print 'Loading CKAN config...'
    common.load_config(config_ini)
    common.register_translator()
    print 'Done'
    # Setup logging to print debug out for local only
    rootLogger = logging.getLogger()
    rootLogger.setLevel(logging.WARNING)
    localLogger = logging.getLogger(__name__)
    localLogger.setLevel(logging.DEBUG)
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(message)s'))
    localLogger.addHandler(handler)
    migrate(options)
Example #56
    def command(cls, config_ini, dataset_names, options):
        common.load_config(config_ini)
        common.register_translator()

        from pylons import config
        apikey = config['dgu.merge_datasets.apikey']
        ckan = ckanapi.RemoteCKAN('https://data.gov.uk', apikey=apikey)
        #ckan = ckanapi.LocalCKAN()

        if options.publisher:
            org_name = common.name_stripped_of_url(options.publisher)
            if options.search:
                results = ckan.action.package_search(q=options.search, fq='publisher:%s' % org_name, rows=100)
                dataset_names.extend([dataset['name']
                                      for dataset in results['results']])
            else:
                org = ckan.action.organization_show(id=org_name,
                                                    include_datasets=True)
                dataset_names.extend([d['name'] for d in org['packages']])


        datasets = []
        datasets_by_name = {}

        def get_extra(dataset, key):
            for extra in dataset['extras']:
                if extra['key'] == key:
                    return extra['value']
        for dataset_name in dataset_names:
            print 'Dataset: %s' % dataset_name
        for dataset_name in dataset_names:
            # strip off the url part of the dataset name, if there is one
            dataset_name = common.name_stripped_of_url(dataset_name)
            dataset = ckan.action.package_show(id=dataset_name)
            harvest_source_ref = get_extra(dataset, 'harvest_source_reference')
            if harvest_source_ref:
                print '** Discarding dataset %s due to harvest source: %s **' \
                    % (dataset_name, harvest_source_ref)
                continue
            datasets.append(dataset)
            datasets_by_name[dataset['name']] = dataset
        datasets.sort(key=lambda x: x['metadata_modified'])

        # aggregate resources
        def resource_identity(res_dict, dataset_name):
            return (res_dict.get('date'), res_dict['url'],
                    res_dict.get('title') or res_dict['description'],
                    res_dict.get('format'),
                    dataset_name)
        combined_resources = {}  # identity
        res_stats = Stats()
        for dataset in datasets:
            for resource in dataset['resources']:
                identity = resource_identity(resource, dataset['name'])
                resource['dataset_name'] = dataset['name']
                if identity in combined_resources:
                    print res_stats.add('Discarding duplicate', '\n%s duplicate of \n%s' % (resource, combined_resources[identity]))
                else:
                    combined_resources[identity] = resource
        resources = combined_resources.values()

        # find dates for resources
        # NB This has been pulled out into timeseries_convert.py -
        # TODO call that instead of having the code here too.
        if options.frequency:
            url_munge_re = re.compile('(%20|-|_|\.)')

            def fields_to_hunt_for_date(res):
                date = res.get('date')
                if date:
                    yield 'date', date
                title = res.get('title')
                if title:
                    yield 'title', title
                yield 'description', res['description']
                yield 'url', url_munge_re.sub(' ', res['url'])
                if not options.update:
                    dataset = datasets_by_name[res['dataset_name']]
                    yield 'dataset-title', dataset['title']
                    yield 'dataset-notes', dataset['notes']

            ensure_regexes_are_initialized()
            global regexes
            for resource in resources:
                for field_name, field_value in fields_to_hunt_for_date(resource):
                    if options.frequency in ('monthly', 'quarterly', 'twice annually'):
                        month, year = hunt_for_month_and_year(field_value)
                        if year and month:
                            resource['date'] = '%02d/%s' % (month, year)
                            res_stats.add('Found date in %s' % field_name,
                                          '%s %r' %
                                          (resource['date'], resource))
                            if resource.get('resource_type') == 'documentation':
                                resource['resource_type'] = 'file'
                                res_stats.add('Converted additional resource', resource)
                            break
                    elif options.frequency == 'annually':
                        year = regexes['year'].search(field_value)
                        if year:
                            resource['date'] = year.groups()[0]
                            res_stats.add('Found date in %s' % field_name,
                                          '%s %r' %
                                          (resource['date'], resource))
                            if resource.get('resource_type') == 'documentation':
                                resource['resource_type'] = 'file'
                                res_stats.add('Converted additional resource', resource)
                            break
                else:
                    if resource.get('resource_type') == 'documentation':
                        print res_stats.add('Could not find date but it\'s Additional Resource', resource)
                        continue
                    print res_stats.add('Could not find date', resource)
                    continue

            print 'Resources: \n', res_stats

            resources_without_date = [res for res in resources
                                      if not res.get('date') and
                                      res.get('resource_type') != 'documentation']
            for i, res in enumerate(resources_without_date):
                print 'Resources without dates %s/%s' % (i+1, len(resources_without_date))
                for field_name, field_value in fields_to_hunt_for_date(res):
                    print '  %s: %s' % (field_name, field_value.encode('latin-1', 'ignore'))
                print 'https://data.gov.uk/dataset/%s/resource/%s' % (res['dataset_name'], res['id'])
                date_format = {'annually': 'YYYY',
                               'monthly': 'MM/YYYY',
                               'twice annually': 'MM/YYYY',
                               'quarterly': 'MM/YYYY'}
                input_ = raw_input('Date (%s) or DOCS to make it an Additional Resource: ' %
                                   date_format[options.frequency])
                if input_.strip().lower() == 'docs':
                    res['date'] = ''
                    res['resource_type'] = 'documentation'
                else:
                    res['date'] = input_

            resources.sort(key=lambda x: x.get('date', '').split('/')[::-1])

        # Ensure there is not a mixture of resources with and without a date
        have_dates = None
        for res in resources:
            if res.get('resource_type') == 'documentation':
                continue
            if have_dates is None:
                have_dates = bool(res.get('date'))
            else:
                has_date = bool(res.get('date'))
                if has_date != have_dates:
                    print [res.get('date') for res in resources]
                    print 'Cannot mix resources with dates and others without!'
                    import pdb
                    pdb.set_trace()
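The date-hunting code above depends on ensure_regexes_are_initialized() and a module-level regexes dict that are defined elsewhere in the script. A sketch of the 'year' entry it uses; the exact pattern is an assumption:

import re

regexes = None

def ensure_regexes_are_initialized():
    # Compile the shared patterns once; only 'year' is exercised in the excerpt above.
    global regexes
    if regexes is None:
        regexes = {
            'year': re.compile(r'\b(19\d{2}|20\d{2})\b'),
        }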
Example #57
import socket
import json
import logging

from common import load_config, file_default_config
from datanode import DataNode

if __name__ == "__main__":
    # socket.setdefaulttimeout(20)
    config = load_config("config/datanode.json")
    client_datanode_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    host = ""
    port = config["client_comm_port"]
    client_datanode_socket.bind((host, port))
    client_datanode_socket.listen(config["max_client_num"])

    datanode_instance = DataNode(config)
    while True:
        client, addr = client_datanode_socket.accept()
        print("Connect from {}".format(addr))

        # receive command from client
        client_command = client.recv(2048).decode('utf-8')

        # accept command successfully
        client.send(bytes("ack", encoding="utf-8"))
        client_command = json.loads(client_command, encoding="utf-8")
        blockinfo = client_command.get("block_info", None)
        block = None
        ret_block = None
Example #58
def main():
    """
    Main function: run Infernal, filter results and flag RNA genes in TRAPID db.
    """
    cmd_args = parse_arguments()
    # Read experiment's initial processing configuration file
    config = common.load_config(cmd_args.ini_file_initial,
                                {"infernal", "trapid_db", "experiment"})
    # The web application sets the Rfam clan string to 'None' in case the user chose no clans
    # If this is the case, exit the script with an information message
    if config["infernal"]["rfam_clans"] == "None":
        sys.stderr.write(
            "[Message] No Rfam clans selected: skip ncRNA annotation step.\n")
        sys.exit()
    try:
        # Run Infernal, parse and export results to DB
        sys.stderr.write(
            '[Message] Starting ncRNA annotation procedure: %s\n' %
            time.strftime('%Y/%m/%d %H:%M:%S'))
        exp_id = config["experiment"]["exp_id"]
        tmp_exp_dir = config["experiment"]["tmp_exp_dir"]
        rfam_dir = config["infernal"]["rfam_dir"]
        exp_clans = config["infernal"]["rfam_clans"].split(",")
        # Lists containing all needed parameters for `common.db_connect()` (TRAPID + reference DB)
        trapid_db_data = common.get_db_connection_data(config, 'trapid_db')
        reference_db_data = common.get_db_connection_data(
            config, 'reference_db')
        db_connection = common.db_connect(*trapid_db_data)
        common.update_experiment_log(exp_id, 'start_nc_rna_search', 'Infernal',
                                     2, db_connection)
        db_connection.close()
        create_infernal_files(exp_id, tmp_exp_dir, rfam_dir, exp_clans,
                              trapid_db_data)
        # run_cmpress(exp_id=exp_id, tmp_exp_dir=tmp_exp_dir)
        total_m_nts = get_infernal_z_value(exp_id, trapid_db_data)
        infernal_tblout = run_infernal(exp_id, tmp_exp_dir, total_m_nts)
        # Filter Infernal tabulated output (keep best non-overlapping matches)
        # infernal_tblout_filtered = filter_out_overlaps(exp_id=exp_id, tmp_exp_dir=tmp_exp_dir, tblout_file=infernal_tblout)
        infernal_tblout_filtered = keep_best_results(exp_id, tmp_exp_dir,
                                                     infernal_tblout)
        # Get filtered results as list of dict and add clan information
        # Read Rfam clan information from `clanin` file. Would it make more sense to retrieve it when creating it?
        cm_clans = get_exp_cm_clans(exp_id, tmp_exp_dir)
        filtered_infernal_results = infernal_tblout_to_list(
            infernal_tblout_filtered, cm_clans)
        infernal_results = infernal_tblout_to_list(infernal_tblout, cm_clans)
        # Flag potential rna genes (set `is_rna_gene` value to 1 and `rf_ids` in `transcripts` table)
        flag_rna_genes(exp_id, filtered_infernal_results, trapid_db_data)
        # Store filtered results in `rna_similarities` ...
        store_rna_similarities(exp_id, infernal_results, trapid_db_data)
        # ... and `rna_families`
        store_rna_families(exp_id, filtered_infernal_results, trapid_db_data)
        # Annotate transcripts using GO terms from Rfam
        rfam_go = retrieve_rfam_go_data(trapid_db_data)
        go_data = get_go_data(reference_db_data)
        # perform_go_annotation(exp_id, infernal_results, rfam_go, go_data, tmp_exp_dir)
        perform_go_annotation(exp_id, filtered_infernal_results, rfam_go,
                              go_data, tmp_exp_dir)
        # That's it for now
        db_connection = common.db_connect(*trapid_db_data)
        common.update_experiment_log(exp_id, 'stop_nc_rna_search', 'Infernal',
                                     2, db_connection)
        db_connection.close()
        sys.stderr.write(
            '[Message] Finished ncRNA annotation procedure: %s\n' %
            time.strftime('%Y/%m/%d %H:%M:%S'))
    # If any exception was raised, update the experiment's log, set status to 'error', and exit
    except Exception:
        print_exc()
        common.stop_initial_processing_error(exp_id, trapid_db_data)
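common.load_config() is called above with the section names it should read. A sketch of the resulting mapping, with keys taken from the accesses in main() and placeholder values; the reference_db section is implied by get_db_connection_data(config, 'reference_db') even though it is not in the requested set:

# Illustrative shape only; real values come from the experiment's ini file.
config = {
    "experiment": {"exp_id": "123", "tmp_exp_dir": "/tmp/trapid_exp_123"},
    "infernal": {"rfam_clans": "CL00001,CL00002", "rfam_dir": "/data/rfam"},
    "trapid_db": {},      # connection settings consumed by common.get_db_connection_data()
    "reference_db": {},   # likewise, for the reference database
}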
Example #59
import socket
import json

from common import load_config
from namenoode import FileSystem

normal_message = {"success": True, "message": "ack"}
normal_message_bytes = bytes(json.dumps(normal_message).encode('utf-8'))

if __name__ == "__main__":
    # socket.setdefaulttimeout(20)
    config = load_config("config/namenode.json")
    client_server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    host = ""
    port = config["client_comm_port"]
    client_server_socket.bind((host, port))
    client_server_socket.listen(config["max_client_num"])

    file_system = FileSystem(config=config["file_system_config"])
    while True:
        client, addr = client_server_socket.accept()
        print("client address: {}".format(addr))

        # recv command from client
        client_command = client.recv(4096).decode('utf-8')
        client_command = json.loads(client_command, encoding='utf-8')

        if client_command["command"] == "test_list":
            file_system.test_out()
            client.send(normal_message_bytes)
        if client_command["command"] == "list":