Example #1
def get_ops():
    """
        Fetch and ingest ZTF ops data
    """

    # connect to MongoDB:
    print(f'{time_stamp()}: Connecting to DB.')
    mongo = Mongo(host=config['database']['host'],
                  port=config['database']['port'],
                  username=config['database']['username'],
                  password=config['database']['password'],
                  db=config['database']['db'],
                  verbose=0)
    print(f'{time_stamp()}: Successfully connected.')

    collection = 'ZTF_ops'

    print(f'{time_stamp()}: Checking indexes.')
    mongo.db[collection].create_index(
        [('coordinates.radec_geojson', '2dsphere')], background=True)
    mongo.db[collection].create_index([('utc_start', pymongo.ASCENDING),
                                       ('utc_end', pymongo.ASCENDING),
                                       ('fileroot', pymongo.ASCENDING)],
                                      background=True)
    mongo.db[collection].create_index([('jd_start', pymongo.ASCENDING),
                                       ('jd_end', pymongo.ASCENDING),
                                       ('fileroot', pymongo.ASCENDING)],
                                      background=True)
    mongo.db[collection].create_index([('jd_start', pymongo.DESCENDING),
                                       ('pid', pymongo.ASCENDING),
                                       ('field', pymongo.ASCENDING)],
                                      background=True)

    # fetch full table
    print(f'{time_stamp()}: Fetching data.')
    url = config['ztf_ops']['url']
    r = requests.get(url,
                     auth=(config['ztf_ops']['username'],
                           config['ztf_ops']['password']),
                     verify=False)
    if r.status_code == requests.codes.ok:
        with open(os.path.join(config['path']['tmp'], 'allexp.tbl'),
                  'wb') as f:
            f.write(r.content)
    else:
        raise Exception(f'{time_stamp()}: Failed to fetch allexp.tbl')

    latest = list(mongo.db[collection].find({},
                                            sort=[["$natural", -1]],
                                            limit=1))

    print(f'{time_stamp()}: Loading data.')
    df = pd.read_fwf(os.path.join(config['path']['tmp'], 'allexp.tbl'),
                     comment='|',
                     header=None,
                     names=[
                         'utc_start', 'sun_elevation', 'exp', 'filter', 'type',
                         'field', 'pid', 'ra', 'dec', 'slew', 'wait',
                         'fileroot', 'programpi', 'qcomment'
                     ])

    # drop comments:
    comments = df['utc_start'] == 'UT_START'
    df = df.loc[~comments]

    for col in ['sun_elevation', 'exp', 'filter', 'field', 'pid']:
        df[col] = df[col].apply(lambda x: int(x))
    for col in ['ra', 'dec', 'slew', 'wait']:
        df[col] = df[col].apply(lambda x: float(x))

    df['utc_start'] = df['utc_start'].apply(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f'))
    df['utc_end'] = df['utc_start'].add(
        df['exp'].apply(lambda x: datetime.timedelta(seconds=x)))

    df['jd_start'] = df['utc_start'].apply(lambda x: datetime_to_jd(x))
    df['jd_end'] = df['utc_end'].apply(lambda x: datetime_to_jd(x))

    # keep only rows newer than the latest ingested record (jd_start > latest jd_start)
    if len(latest) > 0:
        new = df['jd_start'] > latest[0].get('jd_start', 0)

        if sum(new):
            print(f'{time_stamp()}: Found {sum(new)} new records.')
            df = df.loc[new]
        else:
            # no new data? take a nap...
            print(f'{time_stamp()}: No new data found.')
            # close connection to db
            mongo.client.close()
            print(f'{time_stamp()}: Disconnected from db.')
            return

    documents = df.to_dict('records')
    documents = [mongify(doc) for doc in documents]

    print(f'{time_stamp()}: Inserting {len(documents)} documents.')

    mongo.insert_many(collection=collection, documents=documents)

    # close connection to db
    mongo.client.close()
    print(f'{time_stamp()}: Disconnected from db.')
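
The mongify() helper used above is not shown in this example. As a rough sketch of what it is assumed to do, given the 2dsphere index created on coordinates.radec_geojson: attach a GeoJSON point (longitude shifted into [-180, 180]) to each ops document. Illustrative only, not the actual implementation.

def mongify(doc: dict) -> dict:
    """Hypothetical: add GeoJSON coordinates so the 2dsphere index applies."""
    doc = dict(doc)
    doc["coordinates"] = {
        "radec_geojson": {
            "type": "Point",
            # GeoJSON expects lon in [-180, 180], lat in [-90, 90] (deg)
            "coordinates": [doc["ra"] - 180.0, doc["dec"]],
        }
    }
    return doc
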
Example #2
def process_file(file, collection, batch_size):

    # connect to MongoDB:
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    log("Successfully connected")

    collection = "VLASS_DR1"

    log(f"Processing {file}")

    names = [
        "Component_name",
        "RA",
        "DEC",
        "E_RA",
        "E_DEC",
        "Total_flux",
        "E_Total_flux",
        "Peak_flux",
        "E_Peak_flux",
        "Maj",
        "E_Maj",
        "Min",
        "E_Min",
        "Duplicate_flag",
        "Quality_flag",
    ]

    for chunk_index, dataframe_chunk in enumerate(
        pd.read_csv(file, chunksize=batch_size)
    ):

        log(f"{file}: processing batch # {chunk_index + 1}")

        dataframe_chunk = dataframe_chunk[names]
        dataframe_chunk = dataframe_chunk[dataframe_chunk["Duplicate_flag"] < 2]
        dataframe_chunk = dataframe_chunk[dataframe_chunk["Quality_flag"] == 0]

        batch = dataframe_chunk.to_dict(orient="records")

        bad_document_indexes = []

        for document_index, document in enumerate(batch):
            try:
                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                # string format: H:M:S, D:M:S
                document["coordinates"]["radec_str"] = [
                    deg2hms(document["RA"]),
                    deg2dms(document["DEC"]),
                ]
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [document["RA"] - 180.0, document["DEC"]]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
            except Exception as e:
                log(str(e))
                bad_document_indexes.append(document_index)

        if len(bad_document_indexes) > 0:
            log("Removing bad docs")
            for index in sorted(bad_document_indexes, reverse=True):
                del batch[index]

        # ingest batch
        mongo.insert_many(collection=collection, documents=batch)

    # disconnect from db:
    try:
        mongo.client.close()
    finally:
        log("Successfully disconnected from db")
Example #3
def process_file(file, collection, batch_size):

    # connect to MongoDB:
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    log("Successfully connected")

    collection = "IGAPS_DR2"

    log(f"Processing {file}")

    names = [
        "name",
        "RA",
        "DEC",
        "gal_long",
        "gal_lat",
        "sourceID",
        "posErr",
        "mergedClass",
        "pStar",
        "pGalaxy",
        "pNoise",
        "i",
        "iErr",
        "iAB",
        "iEll",
        "iClass",
        "iDeblend",
        "iSaturated",
        "iVignetted",
        "iTrail",
        "iTruncated",
        "iBadPix",
        "iMJD",
        "iSeeing",
        "iDetectionID",
        "iDeltaRA",
        "iDeltaDEC",
        "ha",
        "haErr",
        "haAB",
        "haEll",
        "haClass",
        "haDeblend",
        "haSaturated",
        "haVignetted",
        "haTrail",
        "haTruncated",
        "haBadPix",
        "haMJD",
        "haSeeing",
        "haDetectionID",
        "haDeltaRA",
        "haDeltaDEC",
        "r_I",
        "rErr_I",
        "rAB_I",
        "rEll_I",
        "rClass_I",
        "rDeblend_I",
        "rSaturated_I",
        "rVignetted_I",
        "rTrail_I",
        "rTruncated_I",
        "rBadPix_I",
        "rMJD_I",
        "rSeeing_I",
        "rDetectionID_I",
        "r_U",
        "rErr_U",
        "rAB_U",
        "rEll_U",
        "rClass_U",
        "rDeblend_U",
        "rSaturated_U",
        "rVignetted_U",
        "rTrail_U",
        "rTruncated_U",
        "rBadPix_U",
        "rMJD_U",
        "rSeeing_U",
        "rDetectionID_U",
        "rDeltaRA_U",
        "rDeltaDEC_U",
        "g",
        "gErr",
        "gAB",
        "gEll",
        "gClass",
        "gDeblend",
        "gSaturated",
        "gVignetted",
        "gTrail",
        "gTruncated",
        "gBadPix",
        "gmask",
        "gMJD",
        "gSeeing",
        "gDetectionID",
        "gDeltaRA",
        "gDeltaDEC",
        "U_RGO",
        "UErr",
        "UEll",
        "UClass",
        "UDeblend",
        "USaturated",
        "UVignetted",
        "UTrail",
        "UTruncated",
        "UBadPix",
        "UMJD",
        "USeeing",
        "UDetectionID",
        "UDeltaRA",
        "UDeltaDEC",
        "brightNeighb",
        "deblend",
        "saturated",
        "nBands",
        "errBits",
        "nObs_I",
        "nObs_U",
        "fieldID_I",
        "fieldID_U",
        "fieldGrade_I",
        "fieldGrade_U",
        "emitter",
        "variable",
        "SourceID2",
        "i2",
        "i2Err",
        "i2Class",
        "i2Seeing",
        "i2MJD",
        "i2DeltaRA",
        "i2DeltaDEC",
        "i2DetectionID",
        "i2ErrBits",
        "ha2",
        "ha2Err",
        "ha2Class",
        "ha2Seeing",
        "ha2MJD",
        "ha2DeltaRA",
        "ha2DeltaDEC",
        "ha2DetectionID",
        "ha2ErrBits",
        "r2_I",
        "r2Err_I",
        "r2Class_I",
        "r2Seeing_I",
        "r2MJD_I",
        "r2DeltaRA_I",
        "r2DeltaDEC_I",
        "r2DetectionID_I",
        "r2ErrBits_I",
        "r2_U",
        "r2Err_U",
        "r2Class_U",
        "r2Seeing_U",
        "r2MJD_U",
        "r2DeltaRA_U",
        "r2DeltaDEC_U",
        "r2DetectionID_U",
        "r2ErrBits_U",
        "g2",
        "g2Err",
        "g2Class",
        "g2Seeing",
        "g2MJD",
        "g2DeltaRA",
        "g2DeltaDEC",
        "g2DetectionID",
        "g2ErrBits",
        "U_RGO2",
        "U2Err",
        "U2Class",
        "U2Seeing",
        "U2MJD",
        "U2DeltaRA",
        "U2DeltaDEC",
        "U2DetectionID",
        "U2ErrBits",
        "errBits2",
    ]
    with fits.open(file) as hdulist:
        nhdu = 1
        dataframe = pd.DataFrame(np.asarray(hdulist[nhdu].data), columns=names)

    for chunk_index, dataframe_chunk in dataframe.groupby(
            np.arange(len(dataframe)) // batch_size):

        log(f"{file}: processing batch # {chunk_index + 1}")

        for col, dtype in dataframe_chunk.dtypes.items():
            if dtype == object:
                dataframe_chunk[col] = dataframe_chunk[col].apply(
                    lambda x: x.decode("utf-8"))

        batch = dataframe_chunk.fillna("DROPMEPLEASE").to_dict(
            orient="records")

        # pop nulls - save space
        batch = [{
            key: value
            for key, value in document.items() if value != "DROPMEPLEASE"
        } for document in batch]

        bad_document_indexes = []

        for document_index, document in enumerate(batch):
            try:
                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                # string format: H:M:S, D:M:S
                document["coordinates"]["radec_str"] = [
                    deg2hms(document["RA"]),
                    deg2dms(document["DEC"]),
                ]
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [document["RA"] - 180.0, document["DEC"]]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
            except Exception as e:
                log(str(e))
                bad_document_indexes.append(document_index)

        if len(bad_document_indexes) > 0:
            log("Removing bad docs")
            for index in sorted(bad_document_indexes, reverse=True):
                del batch[index]

        # ingest batch
        mongo.insert_many(collection=collection, documents=batch)

    # disconnect from db:
    try:
        mongo.client.close()
    finally:
        log("Successfully disconnected from db")
Example #4
def process_file(args):
    file, collection, batch_size, rm, verbose = args
    # connect to MongoDB:
    if verbose:
        log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    if verbose:
        log("Successfully connected")

    collection = "Gaia_EDR3"

    if verbose:
        log(f"Processing {file}")

    for chunk_index, dataframe_chunk in enumerate(
            pd.read_csv(file, chunksize=batch_size)):

        if verbose:
            log(f"{file}: processing batch # {chunk_index + 1}")

        dataframe_chunk["_id"] = dataframe_chunk["source_id"].apply(
            lambda x: str(x))

        batch = dataframe_chunk.fillna("DROPMEPLEASE").to_dict(
            orient="records")

        # pop nulls - save space
        batch = [{
            key: value
            for key, value in document.items()
            if value not in ("DROPMEPLEASE", "NOT_AVAILABLE")
        } for document in batch]

        bad_document_indexes = []

        for document_index, document in enumerate(batch):
            try:
                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                # string format: H:M:S, D:M:S
                document["coordinates"]["radec_str"] = [
                    deg2hms(document["ra"]),
                    deg2dms(document["dec"]),
                ]
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [document["ra"] - 180.0, document["dec"]]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
            except Exception as e:
                if verbose:
                    log(str(e))
                bad_document_indexes.append(document_index)

        if len(bad_document_indexes) > 0:
            if verbose:
                log("Removing bad docs")
            for index in sorted(bad_document_indexes, reverse=True):
                del batch[index]

        # ingest batch
        mongo.insert_many(collection=collection, documents=batch)

    # disconnect from db:
    try:
        mongo.client.close()
    finally:
        if verbose:
            log("Successfully disconnected from db")

    # clean up:
    if rm:
        os.remove(file)
        if verbose:
            log(f"Successfully removed {file}")
Example #5
def get_ops():
    """
    Fetch and ingest ZTF ops data
    """
    # connect to MongoDB:
    print(f"{time_stamp()}: Connecting to DB.")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    print(f"{time_stamp()}: Successfully connected.")

    collection = "ZTF_ops"

    print(f"{time_stamp()}: Checking indexes.")
    mongo.db[collection].create_index(
        [("coordinates.radec_geojson", "2dsphere")], background=True)
    mongo.db[collection].create_index(
        [
            ("utc_start", pymongo.ASCENDING),
            ("utc_end", pymongo.ASCENDING),
            ("fileroot", pymongo.ASCENDING),
        ],
        background=True,
    )
    mongo.db[collection].create_index(
        [
            ("jd_start", pymongo.ASCENDING),
            ("jd_end", pymongo.ASCENDING),
            ("fileroot", pymongo.ASCENDING),
        ],
        background=True,
    )
    mongo.db[collection].create_index(
        [
            ("jd_start", pymongo.DESCENDING),
            ("pid", pymongo.ASCENDING),
            ("field", pymongo.ASCENDING),
        ],
        background=True,
    )

    # fetch full table
    print(f"{time_stamp()}: Fetching data.")
    url = config["ztf_ops"]["url"]
    r = requests.get(
        url,
        auth=(config["ztf_ops"]["username"], config["ztf_ops"]["password"]),
        verify=False,
    )
    if r.status_code == requests.codes.ok:
        with open(os.path.join(config["path"]["tmp"], "allexp.tbl"),
                  "wb") as f:
            f.write(r.content)
    else:
        raise Exception(f"{time_stamp()}: Failed to fetch allexp.tbl")

    latest = list(mongo.db[collection].find({},
                                            sort=[["$natural", -1]],
                                            limit=1))

    print(f"{time_stamp()}: Loading data.")
    df = pd.read_fwf(
        os.path.join(config["path"]["tmp"], "allexp.tbl"),
        comment="|",
        widths=[22, 4, 6, 4, 5, 8, 4, 9, 9, 7, 8, 29, 11, 25],
        header=None,
        names=[
            "utc_start",
            "sun_elevation",
            "exp",
            "filter",
            "type",
            "field",
            "pid",
            "ra",
            "dec",
            "slew",
            "wait",
            "fileroot",
            "programpi",
            "qcomment",
        ],
    )

    # drop comments:
    comments = df["utc_start"] == "UT_START"
    df = df.loc[~comments]

    for col in ["sun_elevation", "exp", "filter", "field", "pid"]:
        df[col] = df[col].apply(lambda x: int(x))
    for col in ["ra", "dec", "slew", "wait"]:
        df[col] = df[col].apply(lambda x: float(x))

    df["utc_start"] = df["utc_start"].apply(
        lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%f"))
    df["utc_end"] = df["utc_start"].add(
        df["exp"].apply(lambda x: datetime.timedelta(seconds=x)))

    df["jd_start"] = df["utc_start"].apply(lambda x: datetime_to_jd(x))
    df["jd_end"] = df["utc_end"].apply(lambda x: datetime_to_jd(x))

    # keep only rows newer than the latest ingested record (jd_start > latest jd_start)
    if len(latest) > 0:
        new = df["jd_start"] > latest[0].get("jd_start", 0)

        if sum(new):
            print(f"{time_stamp()}: Found {sum(new)} new records.")
            df = df.loc[new]
        else:
            # no new data? take a nap...
            print(f"{time_stamp()}: No new data found.")
            # close connection to db
            mongo.client.close()
            print(f"{time_stamp()}: Disconnected from db.")
            return

    documents = df.to_dict("records")
    documents = [mongify(doc) for doc in documents]

    print(f"{time_stamp()}: Inserting {len(documents)} documents.")

    mongo.insert_many(collection=collection, documents=documents)

    # close connection to db
    mongo.client.close()
    print(f"{time_stamp()}: Disconnected from db.")
Example #6
def process_file(argument_list: Sequence):
    file_name, collections, batch_size, rm_file, dry_run = argument_list
    try:
        # connect to MongoDB:
        mongo = Mongo(
            host=config["database"]["host"],
            port=config["database"]["port"],
            replica_set=config["database"]["replica_set"],
            username=config["database"]["username"],
            password=config["database"]["password"],
            db=config["database"]["db"],
            verbose=0,
        )

        with tables.open_file(file_name, "r+") as f:
            for group in f.walk_groups():
                pass
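            # walk_groups() is only exhausted to keep a handle on the last
            # group in the file; that "group" is what the reads below use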

            ff_basename = pathlib.Path(file_name).name
            # base id:
            _, field, filt, ccd, _, _ = ff_basename.split("_")
            field = int(field[1:])
            filt = int(filt[1:])
            readout_channel = int(ccd[1:])
            baseid = int(1e13 + field * 1e9 + readout_channel * 1e7 +
                         filt * 1e6)
            exposure_baseid = int(1e16 + field * 1e12 +
                                  readout_channel * 1e10 + filt * 1e9)
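            # baseid / exposure_baseid pack field, readout channel and filter
            # into fixed decimal slots; adding matchedSourceID / expid below
            # then yields document _id values that are unique per matchfile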

            def clean_up_document(group):
                """ Format passed in dicts for Mongo insertion """
                document = {}
                for k, v in group.items():
                    if k == "matchedSourceID":
                        document[k] = group[k]
                        continue
                    if k in sources_int_fields:
                        document[k] = [
                            int(group[k][key2]) for key2 in group[k].keys()
                        ]
                    else:
                        document[k] = [
                            float(group[k][key2]) for key2 in group[k].keys()
                        ]

                # document["ra"] = document["ra"][0]
                # document["dec"] = document["dec"][0]

                # generate unique _id:
                document["_id"] = baseid + document["matchedSourceID"]
                document["filter"] = filt
                document["field"] = field
                document["ccd"] = ccd

                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                _ra = np.median(document["ra"])
                _dec = np.median(document["dec"])
                _radec_str = [deg2hms(_ra), deg2dms(_dec)]
                document["coordinates"]["radec_str"] = _radec_str
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [_ra - 180.0, _dec]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }

                document["data"] = []
                for t, m, e, f, _ra, _dec in zip(
                        document["mjd"],
                        document["mag"],
                        document["magErr"],
                        document["ipacFlags"],
                        document["ra"],
                        document["dec"],
                ):
                    data_point = {
                        "mjd": t,
                        "mag": m,
                        "magerr": e,
                        "ipacflags": f,
                        "ra": _ra,
                        "dec": _dec,
                    }
                    # convert types for pymongo:
                    for k, v in data_point.items():
                        if k in sourcedata_int_fields:
                            data_point[k] = int(data_point[k])
                        else:
                            data_point[k] = float(data_point[k])
                            if k == "mjd":
                                data_point[k] = round(data_point[k], 5)
                            elif k not in ("ra", "dec"):
                                data_point[k] = round(data_point[k], 3)
                    document["data"].append(data_point)
                del (
                    document["mjd"],
                    document["mag"],
                    document["magErr"],
                    document["ipacFlags"],
                    document["ra"],
                    document["dec"],
                )
                document["data"].sort(key=lambda x: x["mjd"])

                return document

            exposures = pd.DataFrame.from_records(group.exposures[:])
            # prepare docs to ingest into db:
            docs_exposures = []
            for index, row in exposures.iterrows():
                try:
                    doc = row.to_dict()
                    # unique exposure id:
                    doc["_id"] = exposure_baseid + doc["expid"]
                    doc["matchfile"] = ff_basename
                    doc["filter"] = filt
                    doc["field"] = field
                    doc["ccd"] = ccd
                    docs_exposures.append(doc)
                except Exception as exception:
                    log(str(exception))

            # ingest exposures in one go:
            if not dry_run:
                mongo.insert_many(collection=collections["exposures"],
                                  documents=docs_exposures)

            sources = pd.DataFrame.from_records(
                group["sources"].read(),
                index="matchedSourceID",
                exclude=sources_fields_to_exclude,
            )
            sourcedatas = pd.DataFrame.from_records(
                group["sourcedata"].read(),
                index="matchedSourceID",
                exclude=sourcedata_to_exclude,
            )

            merged = sources.merge(sourcedatas,
                                   left_index=True,
                                   right_index=True)
            groups = merged.groupby("matchedSourceID")

            # light curves
            docs_sources = []
            batch_num = 1
            # fixme? skip transients
            for row, group in groups:
                try:
                    groupdict = group.to_dict()
                    groupdict["matchedSourceID"] = row
                    current_doc = clean_up_document(groupdict)
                    docs_sources.append(current_doc)
                except Exception as exception:
                    log(str(exception))

                # ingest in batches
                try:
                    if len(docs_sources) % batch_size == 0 and len(
                            docs_sources) != 0:
                        if not dry_run:
                            mongo.insert_many(
                                collection=collections["sources"],
                                documents=docs_sources,
                            )
                        # flush:
                        docs_sources = []
                        batch_num += 1
                except Exception as exception:
                    log(str(exception))

        # ingest remaining
        while len(docs_sources) > 0:
            try:
                # In case mongo crashed and disconnected, docs will accumulate in documents
                # keep on trying to insert them until successful
                if not dry_run:
                    mongo.insert_many(collection=collections["sources"],
                                      documents=docs_sources)
                # flush (also when dry_run, so this loop terminates):
                docs_sources = []

            except Exception as e:
                traceback.print_exc()
                log(e)
                log("Failed, waiting 5 seconds to retry")
                time.sleep(5)

        mongo.client.close()

    except Exception as e:
        traceback.print_exc()
        log(e)
        # if there was an error, return without potentially deleting the file
        return

    if rm_file:
        os.remove(file_name)

def process_file(argument_list: Sequence):
    file_name, collections, batch_size, rm_file, dry_run = argument_list
    try:
        # connect to MongoDB:
        mongo = Mongo(
            host=config["database"]["host"],
            port=config["database"]["port"],
            replica_set=config["database"]["replica_set"],
            username=config["database"]["username"],
            password=config["database"]["password"],
            db=config["database"]["db"],
            verbose=0,
        )

        with tables.open_file(file_name, "r+") as f:
            group = f.root.matches
            ff_basename = pathlib.Path(file_name).name
            # base id:
            _, field, filt, ccd, quadrant, _ = ff_basename.split("_")
            field = int(field)
            filt = filters[filt]
            ccd = int(ccd[1:])
            quadrant = int(quadrant[1:])
            readout_channel = ccd_quad_to_rc(ccd=ccd, quad=quadrant)
            baseid = int(1e13 + field * 1e9 + readout_channel * 1e7 +
                         filt * 1e6)
            exposure_baseid = int(1e16 + field * 1e12 +
                                  readout_channel * 1e10 + filt * 1e9)

            def clean_up_document(document):
                """ Format passed in dicts for Mongo insertion """
                # convert types for pymongo:
                for k, v in document.items():
                    if k != "data":
                        if k in sources_int_fields:
                            document[k] = int(document[k])
                        else:
                            document[k] = float(document[k])
                            if k not in ("ra", "dec"):
                                # this will save a lot of space:
                                document[k] = round(document[k], 3)

                # generate unique _id:
                document["_id"] = baseid + document["matchid"]
                document["filter"] = filt
                document["field"] = field
                document["ccd"] = ccd
                document["quad"] = quadrant
                document["rc"] = readout_channel

                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                _ra = document["ra"]
                _dec = document["dec"]
                _radec_str = [deg2hms(_ra), deg2dms(_dec)]
                document["coordinates"]["radec_str"] = _radec_str
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [_ra - 180.0, _dec]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
                document["data"].sort(key=lambda x: x["hjd"])
                for data_point in document["data"]:
                    # convert types for pymongo:
                    for k, v in data_point.items():
                        if k in sourcedata_int_fields:
                            data_point[k] = int(data_point[k])
                        else:
                            data_point[k] = float(data_point[k])
                            if k not in ("ra", "dec", "hjd"):
                                data_point[k] = round(data_point[k], 3)
                            elif k == "hjd":
                                data_point[k] = round(data_point[k], 5)
                    # generate unique exposure id's that match _id's in exposures collection
                    data_point["uexpid"] = (
                        exposure_baseid + data_point["expid"])

                return document

            exposures = pd.DataFrame.from_records(group.exposures[:])
            # prepare docs to ingest into db:
            docs_exposures = []
            for index, row in exposures.iterrows():
                try:
                    doc = row.to_dict()
                    # unique exposure id:
                    doc["_id"] = exposure_baseid + doc["expid"]
                    doc["matchfile"] = ff_basename
                    doc["filter"] = filt
                    doc["field"] = field
                    doc["ccd"] = ccd
                    doc["quad"] = quadrant
                    doc["rc"] = readout_channel
                    docs_exposures.append(doc)
                except Exception as exception:
                    log(str(exception))

            # ingest exposures in one go:
            if not dry_run:
                mongo.insert_many(collection=collections["exposures"],
                                  documents=docs_exposures)

            # light curves
            docs_sources = []
            batch_num = 1
            # fixme? skip transients
            # for source_type in ('source', 'transient'):
            for source_type in ("source", ):
                sources = pd.DataFrame.from_records(
                    group[f"{source_type}s"].read(),
                    index="matchid",
                    exclude=sources_fields_to_exclude,
                )
                # Load in percentiles separately to compute the IQR column
                # because Pandas DF from_records() only wants 2-D tables
                percentiles = group[f"{source_type}s"].col("percentiles")
                # Ignore float errors due to infinity values
                old_settings = np.seterr(all="ignore")
                iqr = np.round(percentiles[:, 8] - percentiles[:, 3], 3)
                np.seterr(**old_settings)
                sources["iqr"] = iqr

                sourcedatas = pd.DataFrame.from_records(
                    group[f"{source_type}data"][:],
                    index="matchid",
                    exclude=[
                        "ypos",
                        "xpos",
                        "mjd",
                        "psfflux",
                        "psffluxerr",
                        "magerrmodel",
                    ],
                )
                sourcedatas.rename(
                    columns={"ra": "ra_data", "dec": "dec_data"},
                    inplace=True,
                )
                sourcedata_colnames = sourcedatas.columns.values
                # Join sources and their data
                merged = sources.merge(sourcedatas,
                                       left_index=True,
                                       right_index=True)
                prev_matchid = None
                current_doc = None
                for row in merged.itertuples():
                    matchid = row[0]
                    try:
                        # At a new source
                        if matchid != prev_matchid:
                            # Done with last source; save
                            if current_doc is not None:
                                current_doc = clean_up_document(current_doc)
                                docs_sources.append(current_doc)

                            # Set up new doc
                            doc = dict(row._asdict())
                            doc["matchid"] = doc["Index"]
                            doc.pop("Index")
                            # Coerce the source data info into its own nested array
                            first_data_row = {}
                            for col in sourcedata_colnames:
                                if col not in ["dec_data", "ra_data"]:
                                    first_data_row[col] = doc[col]
                                else:
                                    real_col = col.split("_data")[0]
                                    first_data_row[real_col] = doc[col]
                                doc.pop(col)
                            doc["data"] = [first_data_row]
                            current_doc = doc
                        # For continued source, just append new data row
                        else:
                            data_row = {}
                            data = dict(row._asdict())
                            for col in sourcedata_colnames:
                                if col not in ["dec_data", "ra_data"]:
                                    data_row[col] = data[col]
                                else:
                                    real_col = col.split("_data")[0]
                                    data_row[real_col] = data[col]

                            current_doc["data"].append(data_row)

                        prev_matchid = matchid

                    except Exception as exception:
                        log(str(exception))

                    # ingest in batches
                    try:
                        if (len(docs_sources) % batch_size == 0
                                and len(docs_sources) != 0):
                            if not dry_run:
                                mongo.insert_many(
                                    collection=collections["sources"],
                                    documents=docs_sources,
                                )
                            # flush:
                            docs_sources = []
                            batch_num += 1
                    except Exception as exception:
                        log(str(exception))

        # Clean up and append the last doc
        if current_doc is not None:
            current_doc = clean_up_document(current_doc)
            docs_sources.append(current_doc)

        # ingest remaining
        while len(docs_sources) > 0:
            try:
                # In case mongo crashed and disconnected, docs will accumulate in documents
                # keep on trying to insert them until successful
                if not dry_run:
                    mongo.insert_many(collection=collections["sources"],
                                      documents=docs_sources)
                # flush (also when dry_run, so this loop terminates):
                docs_sources = []

            except Exception as e:
                traceback.print_exc()
                log(e)
                log("Failed, waiting 5 seconds to retry")
                time.sleep(5)

        mongo.client.close()

    except Exception as e:
        traceback.print_exc()
        log(e)
        # if there was an error, return without potentially deleting the file
        return

    if rm_file:
        os.remove(file_name)
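
Two helpers assumed by the second variant above are not defined in these examples: the filter-name mapping and ccd_quad_to_rc(). Sketches under the usual ZTF convention (16 CCDs x 4 quadrants -> readout channels 0..63); these are illustrative assumptions, not necessarily the exact implementations:

filters = {"zg": 1, "zr": 2, "zi": 3}


def ccd_quad_to_rc(ccd: int, quad: int) -> int:
    """Hypothetical: CCD (1..16) and quadrant (1..4) -> readout channel (0..63)."""
    return 4 * (ccd - 1) + (quad - 1)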