Example #1
    def run(options, inputData):
        import yaml
        with open(options.configFile) as yaml_file:
            config = yaml.safe_load(yaml_file)

        username = config['cassandra']['local']['username']
        password = config['cassandra']['local']['password']
        keyspace = config['cassandra']['local']['keyspace']
        hostname = config['cassandra']['local']['hostname']

        db = {
            'username': username,
            'password': password,
            'keyspace': keyspace,
            'hostname': hostname
        }

        # Get n lightcurves. Consider doing this in parallel for a proper test.
        # As an initial test, run it single threaded.

        # We have the inputData, get a random subset.
        subset = inputData
        if len(inputData) > int(options.number):
            subset = random.sample(inputData, int(options.number))

        if int(options.nprocesses) > 1 and len(subset) > 1:
            # Do it in parallel!
            currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
            (year, month, day, hour, min, sec) = currentDate.split(':')
            dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)
            nProcessors, listChunks = splitList(subset,
                                                bins=int(options.nprocesses),
                                                preserveOrder=True)

            print("%s Parallel Processing..." %
                  (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db,
                            dateAndTime,
                            nProcessors,
                            listChunks,
                            worker,
                            miscParameters=[options],
                            drainQueues=False)
            print("%s Done Parallel Processing" %
                  (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        else:
            cluster = Cluster(db['hostname'])
            session = cluster.connect()
            session.row_factory = dict_factory
            session.set_keyspace(db['keyspace'])

            lightcurves = getLCByObject(options, session, subset)
            #            for k,v in lightcurves.items():
            #                print(k, v)

            cluster.shutdown()
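
run() assumes a YAML config file with a cassandra -> local block. A minimal sketch of a matching config (all values are made up; hostname is shown as a list because Cluster() takes a list of contact points) and how it maps onto the dictionary lookups above:

import yaml

# Hypothetical config layout matching the keys read above; all values are placeholders.
exampleConfig = """
cassandra:
  local:
    username: myuser
    password: mypass
    keyspace: atlas
    hostname:
      - db0
      - db1
"""

config = yaml.safe_load(exampleConfig)
print(config['cassandra']['local']['keyspace'])   # atlas
print(config['cassandra']['local']['hostname'])   # ['db0', 'db1']
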
Example #2
def loadGenericCassandraTable(session, table, data, bundlesize=1, types=None):

    if len(data) == 0:
        return

    keys = list(data[0].keys())
    typesDict = OrderedDict()

    if types is not None:
        i = 0
        for k in keys:
            typesDict[k] = types[i]
            i += 1

    formatSpecifier = ','.join(['%s' for i in keys])

    chunks = int(1.0 * len(data) / bundlesize + 0.5)
    if chunks == 0:
        subList = [data]
    else:
        bins, subList = splitList(data, bins=chunks, preserveOrder=True)

    for dataChunk in subList:
        try:
            sql = "insert into %s " % table
            sql += "(%s)" % ','.join(['%s' % k for k in keys])
            sql += " values "
            sql += ',\n'.join(
                ['(' + formatSpecifier + ')' for x in range(len(dataChunk))])
            sql += ';'

            values = []
            for row in dataChunk:
                # The data comes from a CSV. We need to cast the results using the types.
                for key in keys:
                    if types is not None:
                        value = nullValueNULL(boolToInteger(row[key]))
                        if value is not None:
                            value = eval(typesDict[key])(value)
                        values.append(value)
                    # The data is already in the right python type.
                    else:
                        value = row[key]
                        values.append(value)

#            print(sql, tuple(values))
            session.execute(sql, tuple(values))

        except Exception as e:
            print("Cassandra loading EXCEPTION", e)
            #print "Error %d: %s" % (e.args[0], e.args[1])

    return
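
To make the batched statement construction concrete, here is a self-contained sketch (table and column names are made up) of the SQL string and flattened values tuple that the loop above would hand to session.execute for a two-row chunk. Note that bundlesize defaults to 1, so in practice each statement usually carries a single row.

# Hypothetical illustration of the statement/values pairing built above.
keys = ['objectid', 'mjd', 'mag']
formatSpecifier = ','.join(['%s' for _ in keys])          # '%s,%s,%s'

dataChunk = [
    {'objectid': 1, 'mjd': 59000.1, 'mag': 18.2},
    {'objectid': 2, 'mjd': 59000.2, 'mag': 18.4},
]

sql = "insert into mytable "
sql += "(%s)" % ','.join(keys)
sql += " values "
sql += ',\n'.join(['(' + formatSpecifier + ')' for _ in dataChunk])
sql += ';'

values = [row[k] for row in dataChunk for k in keys]

print(sql)
# insert into mytable (objectid,mjd,mag) values (%s,%s,%s),
# (%s,%s,%s);
print(tuple(values))
# (1, 59000.1, 18.2, 2, 59000.2, 18.4)
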
def executeLoad(conn, table, data, bundlesize=100):
    import MySQLdb

    rowsUpdated = 0

    if len(data) == 0:
        return rowsUpdated

    keys = list(data[0].keys())
    formatSpecifier = ','.join(['%s' for i in keys])

    chunks = int(1.0 * len(data) / bundlesize + 0.5)
    if chunks == 0:
        subList = [data]
    else:
        bins, subList = splitList(data, bins=chunks, preserveOrder=True)

    for dataChunk in subList:
        try:
            cursor = conn.cursor(MySQLdb.cursors.DictCursor)

            sql = "insert ignore into %s " % table
            sql += "(%s)" % ','.join(['`%s`' % k for k in keys])
            sql += " values "
            sql += ',\n'.join(
                ['(' + formatSpecifier + ')' for x in range(len(dataChunk))])
            sql += ';'

            values = []
            for row in dataChunk:
                for key in keys:
                    values.append(nullValueNULL(boolToInteger(row[key])))

            cursor.execute(sql, tuple(values))

            rowsUpdated = cursor.rowcount
            cursor.close()

        except MySQLdb.Error as e:
            print(cursor._last_executed)
            print("Error %d: %s" % (e.args[0], e.args[1]))

        conn.commit()

    return rowsUpdated
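
A hedged usage sketch for the MySQL loader above, assuming a standard MySQLdb connection; the table name and rows are hypothetical.

import MySQLdb

# Hypothetical connection details and data; the table and columns are made up.
conn = MySQLdb.connect(host='localhost', user='myuser', passwd='mypass', db='mydb')

rows = [
    {'objectid': 1, 'ra': 83.20546, 'decl': -20.70055},
    {'objectid': 2, 'ra': 84.19551, 'decl': -22.41100},
]

rowsUpdated = executeLoad(conn, 'detections', rows, bundlesize=100)
conn.close()
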
def ingestDataMultiprocess(options):

    currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    nProcessors, fileSublist = splitList(options.inputFile,
                                         bins=int(options.nprocesses),
                                         preserveOrder=True)

    print("%s Parallel Processing..." %
          (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
    parallelProcess([],
                    dateAndTime,
                    nProcessors,
                    fileSublist,
                    workerIngest,
                    miscParameters=[options],
                    drainQueues=False)
    print("%s Done Parallel Processing" %
          (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
Example #5
def ingestDataMultiprocess(options, fkDict = None):

    currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    # Read the contents of the input file(s) to get the filenames to process.
    files = options.inputFile

    if options.fileoffiles:
        files = []
        for f in options.inputFile:
            with open(f) as fp:
                content = fp.readlines()
                content = [filename.strip() for filename in content]
            files += content

    print(files)
    nProcessors, fileSublist = splitList(files, bins = int(options.nfileprocesses), preserveOrder=True)
    
    print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
    parallelProcess([], dateAndTime, nProcessors, fileSublist, workerIngest, miscParameters = [options, fkDict], drainQueues = False)
    print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
def ingestData(options, inputFiles):
    generateHtmidBulk = which('generate_htmid_bulk')
    if generateHtmidBulk is None:
        sys.stderr.write(
            "Can't find the generate_htmid_bulk executable, so cannot continue.\n"
        )
        exit(1)

    import yaml
    with open(options.configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    db = {
        'username': username,
        'password': password,
        'database': database,
        'hostname': hostname
    }

    currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    for inputFile in inputFiles:
        print("Ingesting %s" % inputFile)
        if 'gz' in inputFile:
            # It's probably gzipped
            f = gzip.open(inputFile, 'rb')
            print(type(f).__name__)
        else:
            f = inputFile

        data = readGenericDataFile(f, delimiter=',', useOrderedDict=True)
        pid = os.getpid()

        tempRADecFile = '/tmp/' + os.path.basename(inputFile) + 'radec_' + str(
            pid)
        tempLoadFile = '/tmp/' + os.path.basename(inputFile) + '_' + str(
            pid) + '.csv'

        with open(tempRADecFile, 'w') as f:
            for row in data:
                f.write('%s %s\n' % (row['ra'], row['dec']))

        htm10IDs = calculate_htm_ids_bulk(generateHtmidBulk, 10, tempRADecFile)
        htm13IDs = calculate_htm_ids_bulk(generateHtmidBulk, 13, tempRADecFile)
        htm16IDs = calculate_htm_ids_bulk(generateHtmidBulk, 16, tempRADecFile)

        os.remove(tempRADecFile)

        for i in range(len(data)):
            # Add the HTM IDs to the data
            data[i]['htm10ID'] = htm10IDs[i]
            data[i]['htm13ID'] = htm13IDs[i]
            data[i]['htm16ID'] = htm16IDs[i]

        nprocesses = int(options.nprocesses)

        if len(data) > 0:
            nProcessors, listChunks = splitList(data,
                                                bins=nprocesses,
                                                preserveOrder=True)

            print("%s Parallel Processing..." %
                  (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db,
                            dateAndTime,
                            nProcessors,
                            listChunks,
                            workerInsert,
                            miscParameters=[options],
                            drainQueues=False)
            print("%s Done Parallel Processing" %
                  (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
Example #7
def ingestData(options, inputFiles, fkDict = None):

    import yaml
    with open(options.configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['cassandra']['local']['username']
    password = config['cassandra']['local']['password']
    keyspace = config['cassandra']['local']['keyspace']
    hostname = config['cassandra']['local']['hostname']

    db = {'username': username,
          'password': password,
          'keyspace': keyspace,
          'hostname': hostname}

    currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    delimiter=options.tableDelimiter
    if delimiter == '\\s':
        delimiter = ' '
    if delimiter == '\\t':
        delimiter = '\t'

    for inputFile in inputFiles:
        print("Ingesting %s" % inputFile)
        if '.gz' in inputFile:
            # It's probably gzipped
            f = gzip.open(inputFile, 'rb')
            print(type(f).__name__)
        else:
            f = inputFile
    
        if 'avro' in inputFile:
            # Data is in Avro packets, with schema. Let's hard-wire to the ZTF schema for the time being.
            avroData = readZTFAvroPacket(f, addhtm16 = True)
            if 'noncandidates' in options.table:
                data = avroData['noncandidates']
            elif 'candidates' in options.table:
                data = avroData['candidates']
            else:
                print("Error. Incorrect table definition for Avro packets. Must contain candidates or noncandidates.")
                exit(1)

        else:
            # Data is in plain text file. No schema present, so will need to provide
            # column types.
            data = readGenericDataFile(f, delimiter=delimiter, useOrderedDict=True)

        # 2021-07-29 KWS This is a bit inefficient, but trim the data down to specified columns if they are present.
        if options.columns:
            trimmedData = []
            for row in data:
                trimmedRow = {key: row[key] for key in options.columns.split(',')}
                trimmedData.append(trimmedRow)
            data = trimmedData


        foreignKey = options.fkfrominputdata
        if foreignKey == 'filename':
            foreignKey = os.path.basename(inputFile).split('.')[0]


        if fkDict:
            for i in range(len(data)):
                try:
                    if options.fktablecols:
                        # just pick out the specified keys
                        keys = options.fktablecols.split(',')
                        for k in keys:
                            data[i][k] = fkDict[foreignKey][k]
                    else:
                        # Use all the keys by default
                        for k,v in fkDict[foreignKey].items():
                            data[i][k] = v
                except KeyError as e:
                    pass

        #print(data[0])
        pid = os.getpid()
    
        if not options.skiphtm:
    
            coords = []
            for row in data:
                coords.append([float(row[options.racol]), float(row[options.deccol])])
    
            htm16Names = htmNameBulk(16, coords)

            # For Cassandra, we're going to split the HTM name across several columns.
            # Furthermore, we only need to do this once for the deepest HTM level, because
            # it is always a subset of the higher levels.  Hence we only need to store
            # the tail end of the HTM name in the actual HTM 16 column.  So... we store
            # the full HTM 10 name as the first 12 characters of the HTM 16 one, the
            # next 3 characters go into the HTM 13 column, and the last 3 characters go
            # into the HTM 16 column.
            # e.g.:
            # ra, dec =      288.70392, 9.99498
            # HTM 10  = N02323033011
            # HTM 13  = N02323033011 211
            # HTM 16  = N02323033011 211 311

            # Incidentally, this hierarchy also works in binary and we should seriously
            # reconsider how we are currently using HTMs.

            # HTM10 ID =    13349829 = 11 00 10 11 10 11 00 11 11 00 01 01
            # HTM13 ID =   854389093 = 11 00 10 11 10 11 00 11 11 00 01 01  10 01 01
            # HTM16 ID = 54680902005 = 11 00 10 11 10 11 00 11 11 00 01 01  10 01 01  11 01 01


            for i in range(len(data)):
                # Add the HTM IDs to the data
                data[i]['htm10'] = htm16Names[i][0:12]
                data[i]['htm13'] = htm16Names[i][12:15]
                data[i]['htm16'] = htm16Names[i][15:18]
    
    
        nprocesses = int(options.nprocesses)
    
        if len(data) > 0:
            nProcessors, listChunks = splitList(data, bins = nprocesses, preserveOrder=True)
    
            print("%s Parallel Processing..." % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db, dateAndTime, nProcessors, listChunks, workerInsert, miscParameters = [options], drainQueues = False)
            print("%s Done Parallel Processing" % (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
Example #8
def executeLoad(session, table, data, bundlesize = 1, types = None):

    rowsUpdated = 0

    if len(data) == 0:
        print('No data!')
        return rowsUpdated

    #if types is None:
    #    return rowsUpdated

    keys = list(data[0].keys())

    typesDict = OrderedDict()

    if types is not None:
        if len(keys) != len(types):
            print("Keys & Types mismatch")
            return rowsUpdated
        i = 0
        for k in keys:
            typesDict[k] = types[i]
            i += 1


    formatSpecifier = ','.join(['%s' for i in keys])

    chunks = int(1.0 * len(data) / bundlesize + 0.5)
    if chunks == 0:
        subList = [data]
    else:
        bins, subList = splitList(data, bins = chunks, preserveOrder = True)


    for dataChunk in subList:
        try:
            sql = "insert into %s " % table
            # Force all keys to be lowercase and devoid of hyphens
            sql += "(%s)" % ','.join(['%s' % k.lower().replace('-','') for k in keys])

            sql += " values "
            sql += ',\n'.join(['('+formatSpecifier+')' for x in range(len(dataChunk))])
            sql += ';'

            values = []

            for row in dataChunk:
                # If the data comes from a CSV, we need to cast the results using the types.
                # Otherwise assume the types are already correct (e.g. data read from an Avro file).
                for key in keys:
                    if types is not None:
                        value = nullValueNULL(boolToInteger(row[key]))
                        if value is not None:
                            value = eval(typesDict[key])(value)
                        values.append(value)
                    # The data is already in the right python type. (Actually it doesn't matter! All the values are strings!)
                    else:
                        value = row[key]
                        values.append(value)


            #print(sql, tuple(values))
            session.execute(sql, tuple(values))


        except Exception as e:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(e).__name__, e.args)
            print(message)

    return rowsUpdated
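
The eval(typesDict[key])(value) cast above assumes types carries plain type names such as 'int', 'float' or 'str' from a trusted caller. Under that assumption, the same cast can be done without eval via a whitelist; a sketch:

# Hypothetical whitelist replacement for the eval-based cast; only the type
# names actually passed in by callers need to appear here.
CASTS = {'int': int, 'float': float, 'str': str, 'bool': bool}

def castValue(value, typeName):
    """Mirror eval(typeName)(value) for a fixed set of type names."""
    if value is None:
        return None
    return CASTS[typeName](value)
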
def main(argv=None):
    """main.

    Args:
        argv:
    """

    opts = docopt(__doc__, version='0.1')
    opts = cleanOptions(opts)
    options = Struct(**opts)

    configFile = options.configfile

    import yaml
    with open(configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    MAX_NUMBER_OF_OBJECTS = int(
        config['postage_stamp_parameters']['max_number_of_objects'])

    db = []
    db.append(username)
    db.append(password)
    db.append(database)
    db.append(hostname)

    detectionList = 1
    customList = None

    conn = dbConnect(hostname, username, password, database)

    update = options.update
    limit = int(options.limit)
    limitafter = int(options.limitafter)

    mlscore = None
    if options.mlscore is not None:
        mlscore = float(options.mlscore)

    objectList = []

    flagDate = '2015-12-20'
    if options.flagdate is not None:
        try:
            flagDate = '%s-%s-%s' % (options.flagdate[0:4],
                                     options.flagdate[4:6],
                                     options.flagdate[6:8])
        except:
            flagDate = '2015-12-20'

    if options.candidate is not None and len(options.candidate) > 0:
        for cand in options.candidate:
            obj = getATLASObject(conn, objectId=int(cand))
            if obj:
                objectList.append(obj)
    else:

        if options.customlist is not None:
            if int(options.customlist) > 0 and int(options.customlist) < 100:
                customList = int(options.customlist)
                objectList = getObjectsByCustomList(conn,
                                                    customList,
                                                    processingFlags=0)
            else:
                print(
                    "The list must be between 1 and 100 inclusive.  Exiting.")
                sys.exit(1)
        else:
            if options.detectionlist is not None:
                if int(options.detectionlist) >= 0 and int(
                        options.detectionlist) < 9:
                    detectionList = int(options.detectionlist)
                    objectList = getObjectsByList(conn,
                                                  listId=detectionList,
                                                  dateThreshold=flagDate,
                                                  processingFlags=0)
                else:
                    print(
                        "The list must be between 0 and 9 inclusive.  Exiting."
                    )
                    sys.exit(1)

    print("LENGTH OF OBJECTLIST = ", len(objectList))

    if mlscore is not None and not (
            options.candidate
    ):  # Only do this filter if the IDs are not provided explicitly.
        updatedList = []
        for row in objectList:
            if row['zooniverse_score'] is not None and row[
                    'zooniverse_score'] >= mlscore:
                updatedList.append(row)
        if len(updatedList) > 0:
            objectList = updatedList
            print("LENGTH OF CLIPPED OBJECTLIST = ", len(objectList))

    currentDate = datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    # Single threaded
    #perObjectExps, exposureSet = getForcedPhotometryUniqueExposures(conn, objectList, discoveryLimit = limit, ddc = options.ddc, useFlagDate = options.useflagdate)
    perObjectExps, exposureSet = getForcedPhotometryUniqueExposures(
        conn,
        objectList,
        discoveryLimit=limit,
        cutoffLimit=limitafter,
        ddc=options.ddc,
        useFlagDate=options.useflagdate)
    if options.test:
        for obj in objectList:
            print(obj['id'])
            for exp in perObjectExps[obj['id']]['exps']:
                print(exp)
        return 0
    # We'll hand the entire perObjectExps dictionary to each thread.

    # Download threads with multiprocessing - try 10 threads by default
    print("TOTAL OBJECTS = %d" % len(exposureSet))

    print("Downloading exposures...")

    if not options.skipdownload:
        if len(exposureSet) > 0:
            nProcessors, listChunks = splitList(exposureSet,
                                                bins=int(
                                                    options.downloadthreads))

            print("%s Parallel Processing..." %
                  (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db,
                            dateAndTime,
                            nProcessors,
                            listChunks,
                            workerExposureDownloader,
                            miscParameters=[options],
                            drainQueues=False)
            print("%s Done Parallel Processing" %
                  (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

            # Belt and braces - try again with one less thread, just in case the previous one failed.
            nProcessors, listChunks = splitList(
                exposureSet, bins=int(options.downloadthreads) - 1)

            print("%s Parallel Processing..." %
                  (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db,
                            dateAndTime,
                            nProcessors,
                            listChunks,
                            workerExposureDownloader,
                            miscParameters=[options],
                            drainQueues=False)
            print("%s Done Parallel Processing" %
                  (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

    # Produce stamps with multiprocessing - try n(CPUs) threads by default
    print("Doing Forced Photometry...")

    if len(objectList) > 0:
        nProcessors, listChunks = splitList(objectList)

        print("%s Parallel Processing..." %
              (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        objectsForUpdate = parallelProcess(
            db,
            dateAndTime,
            nProcessors,
            listChunks,
            workerForcedPhotometry,
            miscParameters=[options, perObjectExps])
        print("%s Done Parallel Processing" %
              (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        if len(objectsForUpdate) > 0 and update:
            insertForcedPhotometry(conn, objectsForUpdate)

    conn.close()

    return 0
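
Every script here shares the docopt -> cleanOptions -> Struct preamble. cleanOptions and Struct are project utilities; judging from how the result is used (a dict in, attribute access out), Struct is presumably little more than the sketch below.

class StructSketch:
    """Hypothetical stand-in for the project's Struct: exposes dict keys as attributes."""
    def __init__(self, **entries):
        self.__dict__.update(entries)

# opts = {'configfile': 'config.yaml', 'update': False}
# options = StructSketch(**opts)
# options.configfile   -> 'config.yaml'
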
Example #10
def main():
    opts = docopt(__doc__, version='0.1')
    opts = cleanOptions(opts)
    options = Struct(**opts)

    configFile = options.configfile
    regex = options.regex

    import yaml
    with open(configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    db = []
    db.append(username)
    db.append(password)
    db.append(database)
    db.append(hostname)

    conn = dbConnect(hostname, username, password, database)

    warnings.filterwarnings("ignore")

    # Parse command line

    currentDate = datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    pid = int(options.pid)
    maxjobs = int(options.maxjobs)
    days = int(options.days)
    camera = options.camera
    try:
        mjdToIngest = options.mjd
    except TypeError as e:
        mjdToIngest = None

    print("camera =", camera)
    print("regex =", regex)

    todayMJD = getCurrentMJD()

    # Use + 1 to include today!
    mjdthreshold = int(todayMJD) - days + 1

    # Specified MJD trumps mjd Threshold, so just go as far back
    # as the specified date
    if mjdToIngest:
        mjdthreshold = int(mjdToIngest[0:5]) - 1

    ingester = options.ingester

    fileList = getFiles(regex,
                        camera,
                        mjdToIngest=mjdToIngest,
                        mjdthreshold=mjdthreshold,
                        days=days,
                        atlasroot=options.atlasroot,
                        options=options)
    ingestedFiles = getFilesIngestedddc2(conn,
                                         mjdthreshold=mjdthreshold,
                                         camera=camera)

    fileListDict = OrderedDict()

    print("List of files...")
    for row in fileList:
        fileListDict[os.path.basename(row)] = row
        print(row)

    print("List of ingested files...")
    for row in ingestedFiles:
        print(row)

    filesToIngest = [
        fileListDict[x]
        for x in list(set(fileListDict.keys()) - set(ingestedFiles))
    ]
    filesToIngest.sort()

    print("List of files to ingest...")
    for row in filesToIngest:
        print(row)

    print("TOTAL OBJECTS TO CHECK = %d" % len(filesToIngest))

    if len(fileList) > 0:
        # 2018-02-06 KWS Use half the default number of processes. This may ironically speed up ingest.
        nProcessors, listChunks = splitList(filesToIngest, bins=28)

        print("%s Parallel Processing..." %
              (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        parallelProcess(db,
                        dateAndTime,
                        nProcessors,
                        listChunks,
                        worker,
                        miscParameters=[options],
                        drainQueues=False)
        print("%s Done Parallel Processing" %
              (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

    conn.close()
    return 0
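
The ingest selection above keys everything on basenames: fileListDict maps basename to full path, and the set difference against the already-ingested basenames leaves only the new files. A toy illustration with made-up paths:

import os
from collections import OrderedDict

# Hypothetical file lists purely for illustration.
fileList = ['/data/02a/59000/exposures_0001.ddc',
            '/data/02a/59000/exposures_0002.ddc']
ingestedFiles = ['exposures_0001.ddc']

fileListDict = OrderedDict((os.path.basename(p), p) for p in fileList)
filesToIngest = sorted(fileListDict[x]
                       for x in set(fileListDict.keys()) - set(ingestedFiles))
# ['/data/02a/59000/exposures_0002.ddc']
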
Example #11
def main(argv=None):
    opts = docopt(__doc__, version='0.1')
    opts = cleanOptions(opts)

    # Use utils.Struct to convert the dict into an object for compatibility with old optparse code.
    options = Struct(**opts)

    #keyspace = 'atlas'
    #host = ['db0', 'db1', 'db2', 'db3', 'db4']

    # random star
    #ra = 83.20546
    #dec = -20.70055

    # ATLAS17nij
    #ra = 82.46704
    #dec = -19.52058

    # ATLAS20biio
    #ra = 83.24691
    #dec = -19.11739

    # ATLAS20bbio - very good!!
    #ra = 81.27903
    #dec = -21.24643

    # ATLAS18vre
    #ra = 84.19551
    #dec = -22.41100

    # ATLAS19bdbm
    #ra = 85.10436
    #dec = -18.09766

    # ATLAS20bbff
    #ra = 86.52075
    #dec = -23.56601

    # ATLAS20ymv - THIS IS the CENTRE OBJECT. We did a 10 degree sweep around this.
    #ra = 74.55677
    #dec = -20.35753

    # ATLAS17lvn - bright foreground star
    #ra = 68.75953
    #dec = -14.22797

    import yaml
    with open(options.configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['cassandra']['local']['username']
    password = config['cassandra']['local']['password']
    keyspace = config['cassandra']['local']['keyspace']
    hostname = config['cassandra']['local']['hostname']

    db = {
        'username': username,
        'password': password,
        'keyspace': keyspace,
        'hostname': hostname
    }

    coordslist = []

    if options.coordsfromfile:
        coordslist = readGenericDataFile(options.coords, delimiter=',')
    else:
        coordslist.append({
            'ra': options.coords.split(',')[0],
            'dec': options.coords.split(',')[1]
        })

    if options.number and int(options.number) < len(coordslist):
        coordslist = random.sample(coordslist, int(options.number))

    if int(options.nprocesses) > 1 and len(coordslist) > 1:
        # Do it in parallel!
        currentDate = datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
        (year, month, day, hour, min, sec) = currentDate.split(':')
        dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)
        nProcessors, listChunks = splitList(coordslist,
                                            bins=int(options.nprocesses),
                                            preserveOrder=True)

        print("%s Parallel Processing..." %
              (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        parallelProcess(db,
                        dateAndTime,
                        nProcessors,
                        listChunks,
                        worker,
                        miscParameters=[options],
                        drainQueues=False)
        print("%s Done Parallel Processing" %
              (datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
    else:
        cluster = Cluster(db['hostname'])
        session = cluster.connect()
        session.row_factory = dict_factory
        session.set_keyspace(db['keyspace'])

        getLCData(options, session, coordslist)

        cluster.shutdown()
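
For reference, the Cassandra objects used in the else branch come from the DataStax Python driver; a self-contained version of that connection block, with the contact points and keyspace taken from the commented-out values near the top of this example, would be:

from cassandra.cluster import Cluster
from cassandra.query import dict_factory

# Contact points and keyspace echo the commented-out values above.
cluster = Cluster(['db0', 'db1', 'db2', 'db3', 'db4'])
session = cluster.connect()
session.row_factory = dict_factory     # rows are returned as dicts keyed by column name
session.set_keyspace('atlas')

# ... run queries, e.g. via getLCData(options, session, coordslist) ...

cluster.shutdown()
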
Example #12
def main():
    """main.
    """

    opts = docopt(__doc__, version='0.1')
    opts = cleanOptions(opts)
    options = Struct(**opts)

    configFile = options.configfile

    import yaml
    with open(configFile) as yaml_file:
        config = yaml.safe_load(yaml_file)

    username = config['databases']['local']['username']
    password = config['databases']['local']['password']
    database = config['databases']['local']['database']
    hostname = config['databases']['local']['hostname']

    MAX_NUMBER_OF_OBJECTS = int(
        config['postage_stamp_parameters']['max_number_of_objects'])

    db = []
    db.append(username)
    db.append(password)
    db.append(database)
    db.append(hostname)

    detectionList = 1
    customList = None

    conn = dbConnect(hostname, username, password, database)

    update = options.update
    limit = int(options.limit)
    mostRecent = not (options.earliest)
    nondetections = options.nondetections
    discoverylimit = int(options.discoverylimit)
    lastdetectionlimit = int(options.lastdetectionlimit)

    objectList = []

    try:
        requestType = REQUESTTYPES[options.requesttype]
    except KeyError as e:
        requestType = REQUESTTYPES['incremental']

    print("REQUEST TYPE = ", requestType)

    flagDate = '2015-12-20'
    if options.flagdate is not None:
        try:
            flagDate = '%s-%s-%s' % (options.flagdate[0:4],
                                     options.flagdate[4:6],
                                     options.flagdate[6:8])
        except:
            flagDate = '2015-12-20'

    if options.candidate is not None and len(options.candidate) > 0:
        for cand in options.candidate:
            objectList.append({'id': int(cand)})
    else:

        if options.customlist is not None:
            if int(options.customlist) > 0 and int(options.customlist) < 100:
                customList = int(options.customlist)
                objectList = getObjectsByCustomList(conn, customList)
            else:
                print(
                    "The list must be between 1 and 100 inclusive.  Exiting.")
                sys.exit(1)
        else:
            if options.detectionlist is not None:
                if int(options.detectionlist) >= 0 and int(
                        options.detectionlist) < 9:
                    detectionList = int(options.detectionlist)
                    objectList = getObjectsByList(conn,
                                                  listId=detectionList,
                                                  dateThreshold=flagDate)
                else:
                    print(
                        "The list must be between 0 and 6 inclusive.  Exiting."
                    )
                    sys.exit(1)

    currentDate = datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")
    (year, month, day, hour, min, sec) = currentDate.split(':')
    dateAndTime = "%s%s%s_%s%s%s" % (year, month, day, hour, min, sec)

    if len(objectList) > MAX_NUMBER_OF_OBJECTS:
        sys.stderr.write(
            "The number of objects (%d) exceeds the maximum allowed (%d). Cannot continue.\n"
            % (len(objectList), MAX_NUMBER_OF_OBJECTS))
        sys.exit(1)

    # Only download exposures if requested. Otherwise assume we already HAVE the data.
    if not options.skipdownload:
        exposureSet = getUniqueExposures(conn,
                                         objectList,
                                         limit=limit,
                                         mostRecent=mostRecent,
                                         nonDets=nondetections,
                                         discoveryLimit=discoverylimit,
                                         lastDetectionLimit=lastdetectionlimit,
                                         requestType=requestType,
                                         ddc=options.ddc)

        # Download threads with multiprocessing - try 10 threads by default
        print("TOTAL OBJECTS = %d" % len(exposureSet))

        print("Downloading exposures...")

        if len(exposureSet) > 0:
            nProcessors, listChunks = splitList(exposureSet,
                                                bins=int(
                                                    options.downloadthreads))

            print("%s Parallel Processing..." %
                  (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db,
                            dateAndTime,
                            nProcessors,
                            listChunks,
                            workerImageDownloader,
                            miscParameters=[options],
                            drainQueues=False)
            print("%s Done Parallel Processing" %
                  (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

            # Belt and braces. Do again, with one less thread.
            nProcessors, listChunks = splitList(
                exposureSet, bins=int(options.downloadthreads) - 1)

            print("%s Parallel Processing..." %
                  (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
            parallelProcess(db,
                            dateAndTime,
                            nProcessors,
                            listChunks,
                            workerImageDownloader,
                            miscParameters=[options],
                            drainQueues=False)
            print("%s Done Parallel Processing" %
                  (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

    # Produce stamps with multiprocessing - try n(CPUs) threads by default
    print("Producing stamps...")

    if len(objectList) > 0:
        nProcessors, listChunks = splitList(objectList, bins=48)

        print("%s Parallel Processing..." %
              (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))
        parallelProcess(db,
                        dateAndTime,
                        nProcessors,
                        listChunks,
                        workerStampCutter,
                        miscParameters=[
                            limit, mostRecent, nondetections, discoverylimit,
                            lastdetectionlimit, requestType, options.ddc,
                            options.wpwarp, options
                        ],
                        drainQueues=False)
        print("%s Done Parallel Processing" %
              (datetime.datetime.now().strftime("%Y:%m:%d:%H:%M:%S")))

    conn.close()

    return 0