Example #1
    def test_tns_watcher(self):
        log("Connecting to DB")
        mongo = Mongo(
            host=config["database"]["host"],
            port=config["database"]["port"],
            replica_set=config["database"]["replica_set"],
            username=config["database"]["username"],
            password=config["database"]["password"],
            db=config["database"]["db"],
            verbose=True,
        )
        log("Successfully connected")

        collection = config["database"]["collections"]["tns"]

        log("Grabbing 1 page with 5 entries from the TNS and ingesting that into the database"
            )
        get_tns(
            grab_all=False,
            num_pages=1,
            entries_per_page=5,
        )
        log("Done")

        fetched_entries = list(mongo.db[collection].find({}, {"_id": 1}))

        assert len(fetched_entries) > 0
Example #2
def mongo_fixture(request):
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=True,
    )
    log("Successfully connected")

    request.cls.mongo = mongo
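The config-driven connection block above recurs in nearly every example below. As a minimal sketch (assuming the Mongo wrapper accepts exactly the keyword arguments shown in Examples #1 and #2), it could be factored into a hypothetical helper:

def mongo_from_config(config, verbose=0):
    # Build a client from the config["database"] section, mirroring the examples above.
    db_cfg = config["database"]
    return Mongo(
        host=db_cfg["host"],
        port=db_cfg["port"],
        replica_set=db_cfg["replica_set"],
        username=db_cfg["username"],
        password=db_cfg["password"],
        db=db_cfg["db"],
        verbose=verbose,
    )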
Example #3
def run(
    path: str = "./",
    num_processes: int = 1,
    batch_size: int = 2048,
):
    """Pre-process and ingest IGAPS catalog
    :param path: path to CSV data files (~98 GB tarred)
                 see http://www.star.ucl.ac.uk/IGAPS/catalogue/
    :param num_processes: number of processes for parallel ingestion
    :param batch_size: batch size for ingestion
    :return:
    """

    files = glob.glob(os.path.join(path, "igaps-*.fits.gz"))

    catalog_name = "IGAPS_DR2"

    log("Connecting to DB")
    m = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
    )
    log("Successfully connected")

    # Create indexes in the database:
    log("Creating indexes")
    # 2D position on the sphere, ID:
    m.db[catalog_name].create_index([("coordinates.radec_geojson", "2dsphere"),
                                     ("_id", 1)],
                                    background=True)

    input_list = [(f, catalog_name, batch_size) for f in files]

    # istarmap: a lazy starmap variant, assumed to be patched onto multiprocessing.Pool elsewhere in this codebase
    with mp.Pool(processes=num_processes) as p:
        for _ in tqdm(p.istarmap(process_file, input_list),
                      total=len(input_list)):
            pass
Example #4
def process_file(file, collection, batch_size):

    # connect to MongoDB:
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    log("Successfully connected")

    collection = "VLASS_DR1"

    log(f"Processing {file}")

    names = [
        "Component_name",
        "RA",
        "DEC",
        "E_RA",
        "E_DEC",
        "Total_flux",
        "E_Total_flux",
        "Peak_flux",
        "E_Peak_flux",
        "Maj",
        "E_Maj",
        "Min",
        "E_Min",
        "Duplicate_flag",
        "Quality_flag",
    ]

    for chunk_index, dataframe_chunk in enumerate(
        pd.read_csv(file, chunksize=batch_size)
    ):

        log(f"{file}: processing batch # {chunk_index + 1}")

        dataframe_chunk = dataframe_chunk[names]
        dataframe_chunk = dataframe_chunk[dataframe_chunk["Duplicate_flag"] < 2]
        dataframe_chunk = dataframe_chunk[dataframe_chunk["Quality_flag"] == 0]

        batch = dataframe_chunk.to_dict(orient="records")

        bad_document_indexes = []

        for document_index, document in enumerate(batch):
            try:
                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                # string format: H:M:S, D:M:S
                document["coordinates"]["radec_str"] = [
                    deg2hms(document["RA"]),
                    deg2dms(document["DEC"]),
                ]
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [document["RA"] - 180.0, document["DEC"]]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
            except Exception as e:
                log(str(e))
                bad_document_indexes.append(document_index)

        if len(bad_document_indexes) > 0:
            log("Removing bad docs")
            for index in sorted(bad_document_indexes, reverse=True):
                del batch[index]

        # ingest batch
        mongo.insert_many(collection=collection, documents=batch)

    # disconnect from db:
    try:
        mongo.client.close()
    finally:
        log("Successfully disconnected from db")
Example #5
def get_ops():
    """
        Fetch and ingest ZTF ops data
    """

    # connect to MongoDB:
    print(f'{time_stamp()}: Connecting to DB.')
    mongo = Mongo(host=config['database']['host'],
                  port=config['database']['port'],
                  username=config['database']['username'],
                  password=config['database']['password'],
                  db=config['database']['db'],
                  verbose=0)
    print(f'{time_stamp()}: Successfully connected.')

    collection = 'ZTF_ops'

    print(f'{time_stamp()}: Checking indexes.')
    mongo.db[collection].create_index(
        [('coordinates.radec_geojson', '2dsphere')], background=True)
    mongo.db[collection].create_index([('utc_start', pymongo.ASCENDING),
                                       ('utc_end', pymongo.ASCENDING),
                                       ('fileroot', pymongo.ASCENDING)],
                                      background=True)
    mongo.db[collection].create_index([('jd_start', pymongo.ASCENDING),
                                       ('jd_end', pymongo.ASCENDING),
                                       ('fileroot', pymongo.ASCENDING)],
                                      background=True)
    mongo.db[collection].create_index([('jd_start', pymongo.DESCENDING),
                                       ('pid', pymongo.ASCENDING),
                                       ('field', pymongo.ASCENDING)],
                                      background=True)

    # fetch full table
    print(f'{time_stamp()}: Fetching data.')
    url = config['ztf_ops']['url']
    r = requests.get(url,
                     auth=(config['ztf_ops']['username'],
                           config['ztf_ops']['password']),
                     verify=False)
    if r.status_code == requests.codes.ok:
        with open(os.path.join(config['path']['tmp'], 'allexp.tbl'),
                  'wb') as f:
            f.write(r.content)
    else:
        raise Exception(f'{time_stamp()}: Failed to fetch allexp.tbl')

    latest = list(mongo.db[collection].find({},
                                            sort=[["$natural", -1]],
                                            limit=1))

    print(f'{time_stamp()}: Loading data.')
    df = pd.read_fwf(os.path.join(config['path']['tmp'], 'allexp.tbl'),
                     comment='|',
                     header=None,
                     names=[
                         'utc_start', 'sun_elevation', 'exp', 'filter', 'type',
                         'field', 'pid', 'ra', 'dec', 'slew', 'wait',
                         'fileroot', 'programpi', 'qcomment'
                     ])

    # drop comments:
    comments = df['utc_start'] == 'UT_START'
    df = df.loc[~comments]

    for col in ['sun_elevation', 'exp', 'filter', 'field', 'pid']:
        df[col] = df[col].apply(lambda x: int(x))
    for col in ['ra', 'dec', 'slew', 'wait']:
        df[col] = df[col].apply(lambda x: float(x))

    df['utc_start'] = df['utc_start'].apply(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f'))
    df['utc_end'] = df['utc_start'].add(
        df['exp'].apply(lambda x: datetime.timedelta(seconds=x)))

    df['jd_start'] = df['utc_start'].apply(lambda x: datetime_to_jd(x))
    df['jd_end'] = df['utc_end'].apply(lambda x: datetime_to_jd(x))

    # drop rows with jd_start <= the latest ingested jd_start
    if len(latest) > 0:
        new = df['jd_start'] > latest[0].get('jd_start', 0)

        if sum(new):
            print(f'{time_stamp()}: Found {sum(new)} new records.')
            df = df.loc[new]
        else:
            # no new data? take a nap...
            print(f'{time_stamp()}: No new data found.')
            # close connection to db
            mongo.client.close()
            print(f'{time_stamp()}: Disconnected from db.')
            return

    documents = df.to_dict('records')
    documents = [mongify(doc) for doc in documents]

    print(f'{time_stamp()}: Inserting {len(documents)} documents.')

    mongo.insert_many(collection=collection, documents=documents)

    # close connection to db
    mongo.client.close()
    print(f'{time_stamp()}: Disconnected from db.')
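datetime_to_jd is assumed here to convert a UTC datetime into a Julian Date; a minimal stand-in under that assumption (not necessarily the project's own implementation):

import datetime

def datetime_to_jd(dt: datetime.datetime) -> float:
    # The Julian Date of the Unix epoch (1970-01-01T00:00:00 UTC) is 2440587.5.
    return 2440587.5 + dt.replace(tzinfo=datetime.timezone.utc).timestamp() / 86400.0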
Example #6
def process_file(file, collection, batch_size):

    # connect to MongoDB:
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    log("Successfully connected")

    collection = "IGAPS_DR2"

    log(f"Processing {file}")

    names = [
        "name",
        "RA",
        "DEC",
        "gal_long",
        "gal_lat",
        "sourceID",
        "posErr",
        "mergedClass",
        "pStar",
        "pGalaxy",
        "pNoise",
        "i",
        "iErr",
        "iAB",
        "iEll",
        "iClass",
        "iDeblend",
        "iSaturated",
        "iVignetted",
        "iTrail",
        "iTruncated",
        "iBadPix",
        "iMJD",
        "iSeeing",
        "iDetectionID",
        "iDeltaRA",
        "iDeltaDEC",
        "ha",
        "haErr",
        "haAB",
        "haEll",
        "haClass",
        "haDeblend",
        "haSaturated",
        "haVignetted",
        "haTrail",
        "haTruncated",
        "haBadPix",
        "haMJD",
        "haSeeing",
        "haDetectionID",
        "haDeltaRA",
        "haDeltaDEC",
        "r_I",
        "rErr_I",
        "rAB_I",
        "rEll_I",
        "rClass_I",
        "rDeblend_I",
        "rSaturated_I",
        "rVignetted_I",
        "rTrail_I",
        "rTruncated_I",
        "rBadPix_I",
        "rMJD_I",
        "rSeeing_I",
        "rDetectionID_I",
        "r_U",
        "rErr_U",
        "rAB_U",
        "rEll_U",
        "rClass_U",
        "rDeblend_U",
        "rSaturated_U",
        "rVignetted_U",
        "rTrail_U",
        "rTruncated_U",
        "rBadPix_U",
        "rMJD_U",
        "rSeeing_U",
        "rDetectionID_U",
        "rDeltaRA_U",
        "rDeltaDEC_U",
        "g",
        "gErr",
        "gAB",
        "gEll",
        "gClass",
        "gDeblend",
        "gSaturated",
        "gVignetted",
        "gTrail",
        "gTruncated",
        "gBadPix",
        "gmask",
        "gMJD",
        "gSeeing",
        "gDetectionID",
        "gDeltaRA",
        "gDeltaDEC",
        "U_RGO",
        "UErr",
        "UEll",
        "UClass",
        "UDeblend",
        "USaturated",
        "UVignetted",
        "UTrail",
        "UTruncated",
        "UBadPix",
        "UMJD",
        "USeeing",
        "UDetectionID",
        "UDeltaRA",
        "UDeltaDEC",
        "brightNeighb",
        "deblend",
        "saturated",
        "nBands",
        "errBits",
        "nObs_I",
        "nObs_U",
        "fieldID_I",
        "fieldID_U",
        "fieldGrade_I",
        "fieldGrade_U",
        "emitter",
        "variable",
        "SourceID2",
        "i2",
        "i2Err",
        "i2Class",
        "i2Seeing",
        "i2MJD",
        "i2DeltaRA",
        "i2DeltaDEC",
        "i2DetectionID",
        "i2ErrBits",
        "ha2",
        "ha2Err",
        "ha2Class",
        "ha2Seeing",
        "ha2MJD",
        "ha2DeltaRA",
        "ha2DeltaDEC",
        "ha2DetectionID",
        "ha2ErrBits",
        "r2_I",
        "r2Err_I",
        "r2Class_I",
        "r2Seeing_I",
        "r2MJD_I",
        "r2DeltaRA_I",
        "r2DeltaDEC_I",
        "r2DetectionID_I",
        "r2ErrBits_I",
        "r2_U",
        "r2Err_U",
        "r2Class_U",
        "r2Seeing_U",
        "r2MJD_U",
        "r2DeltaRA_U",
        "r2DeltaDEC_U",
        "r2DetectionID_U",
        "r2ErrBits_U",
        "g2",
        "g2Err",
        "g2Class",
        "g2Seeing",
        "g2MJD",
        "g2DeltaRA",
        "g2DeltaDEC",
        "g2DetectionID",
        "g2ErrBits",
        "U_RGO2",
        "U2Err",
        "U2Class",
        "U2Seeing",
        "U2MJD",
        "U2DeltaRA",
        "U2DeltaDEC",
        "U2DetectionID",
        "U2ErrBits",
        "errBits2",
    ]
    with fits.open(file) as hdulist:
        nhdu = 1
        dataframe = pd.DataFrame(np.asarray(hdulist[nhdu].data), columns=names)

    for chunk_index, dataframe_chunk in dataframe.groupby(
            np.arange(len(dataframe)) // batch_size):

        log(f"{file}: processing batch # {chunk_index + 1}")

        for col, dtype in dataframe_chunk.dtypes.items():
            if dtype == object:
                dataframe_chunk[col] = dataframe_chunk[col].apply(
                    lambda x: x.decode("utf-8"))

        batch = dataframe_chunk.fillna("DROPMEPLEASE").to_dict(
            orient="records")

        # pop nulls - save space
        batch = [{
            key: value
            for key, value in document.items() if value != "DROPMEPLEASE"
        } for document in batch]

        bad_document_indexes = []

        for document_index, document in enumerate(batch):
            try:
                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                # string format: H:M:S, D:M:S
                document["coordinates"]["radec_str"] = [
                    deg2hms(document["RA"]),
                    deg2dms(document["DEC"]),
                ]
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [document["RA"] - 180.0, document["DEC"]]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
            except Exception as e:
                log(str(e))
                bad_document_indexes.append(document_index)

        if len(bad_document_indexes) > 0:
            log("Removing bad docs")
            for index in sorted(bad_document_indexes, reverse=True):
                del batch[index]

        # ingest batch
        mongo.insert_many(collection=collection, documents=batch)

    # disconnect from db:
    try:
        mongo.client.close()
    finally:
        log("Successfully disconnected from db")
Example #7
def get_tns(grab_all: bool = False,
            num_pages: int = 10,
            entries_per_page: int = 100):
    """
    Queries the TNS and obtains the sources reported to it.

    :param grab_all: grab the complete database from TNS? takes a while!
    :param num_pages: grab the last <num_pages> pages
    :param entries_per_page: number of entries per page to grab
    :return:
    """

    # connect to MongoDB:
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    log("Successfully connected")

    collection = config["database"]["collections"]["tns"]

    if config["database"]["build_indexes"]:
        log("Checking indexes")
        for index in config["database"]["indexes"][collection]:
            try:
                ind = [tuple(ii) for ii in index["fields"]]
                mongo.db[collection].create_index(
                    keys=ind,
                    name=index["name"],
                    background=True,
                    unique=index["unique"],
                )
            except Exception as e:
                log(e)

    log("Fetching data...")

    if grab_all:
        # grab the latest data (5 is the minimum):
        url = os.path.join(config["tns"]["url"],
                           "search?format=csv&num_page=5&page=0")
        data = pd.read_csv(url)
        num_pages = data["ID"].max() // entries_per_page

    for num_page in range(num_pages):
        log(f"Digesting page #{num_page+1} of {num_pages}...")
        url = os.path.join(
            config["tns"]["url"],
            f"search?format=csv&num_page={entries_per_page}&page={num_page}",
        )

        # 20210114: wis-tns.org has issues with their certificate
        csv_data = requests.get(url, allow_redirects=False, timeout=60).content
        data = pd.read_csv(io.StringIO(csv_data.decode("utf-8")))

        for index, row in data.iterrows():
            try:
                doc = mongify(row)
                doc_id = doc.pop("_id", None)
                if doc_id:
                    mongo.update_one(
                        collection=collection,
                        filt={"_id": doc_id},
                        update={"$set": doc},
                        upsert=True,
                    )
            except Exception as e:
                log(str(e))
                log(traceback.format_exc())

    # close connection to db
    mongo.client.close()
    log("Disconnected from db")
Example #8
    def test_ingester(self):

        init_db_sync(config=config, verbose=True)

        log("Setting up paths")
        # path_kafka = pathlib.Path(config["path"]["kafka"])

        path_logs = pathlib.Path(config["path"]["logs"])
        if not path_logs.exists():
            path_logs.mkdir(parents=True, exist_ok=True)

        if config["misc"]["broker"]:
            log("Setting up test groups and filters in Fritz")
            program = Program(group_name="FRITZ_TEST", group_nickname="test")
            Filter(
                collection="ZTF_alerts",
                group_id=program.group_id,
                filter_id=program.filter_id,
            )

            program2 = Program(group_name="FRITZ_TEST_AUTOSAVE", group_nickname="test2")
            Filter(
                collection="ZTF_alerts",
                group_id=program2.group_id,
                filter_id=program2.filter_id,
                autosave=True,
                pipeline=[{"$match": {"objectId": "ZTF20aaelulu"}}],
            )

            program3 = Program(
                group_name="FRITZ_TEST_UPDATE_ANNOTATIONS", group_nickname="test3"
            )
            Filter(
                collection="ZTF_alerts",
                group_id=program3.group_id,
                filter_id=program3.filter_id,
                update_annotations=True,
                pipeline=[
                    {"$match": {"objectId": "ZTF20aapcmur"}}
                ],  # there are 3 alerts in the test set for this oid
            )

        # clean up old Kafka logs
        log("Cleaning up Kafka logs")
        subprocess.run(["rm", "-rf", path_logs / "kafka-logs", "/tmp/zookeeper"])

        log("Starting up ZooKeeper at localhost:2181")

        # start ZooKeeper in the background
        cmd_zookeeper = [
            os.path.join(config["path"]["kafka"], "bin", "zookeeper-server-start.sh"),
            "-daemon",
            os.path.join(config["path"]["kafka"], "config", "zookeeper.properties"),
        ]

        with open(path_logs / "zookeeper.stdout", "w") as stdout_zookeeper:
            # p_zookeeper =
            subprocess.run(
                cmd_zookeeper, stdout=stdout_zookeeper, stderr=subprocess.STDOUT
            )

        # take a nap while it fires up
        time.sleep(3)

        log("Starting up Kafka Server at localhost:9092")

        # start the Kafka server:
        cmd_kafka_server = [
            os.path.join(config["path"]["kafka"], "bin", "kafka-server-start.sh"),
            "-daemon",
            os.path.join(config["path"]["kafka"], "config", "server.properties"),
        ]

        with open(
            os.path.join(config["path"]["logs"], "kafka_server.stdout"), "w"
        ) as stdout_kafka_server:
            # p_kafka_server = subprocess.Popen(cmd_kafka_server, stdout=stdout_kafka_server, stderr=subprocess.STDOUT)
            # p_kafka_server =
            subprocess.run(cmd_kafka_server)

        # take a nap while it fires up
        time.sleep(3)

        # get kafka topic names with kafka-topics command
        cmd_topics = [
            os.path.join(config["path"]["kafka"], "bin", "kafka-topics.sh"),
            "--zookeeper",
            config["kafka"]["zookeeper.test"],
            "-list",
        ]

        topics = (
            subprocess.run(cmd_topics, stdout=subprocess.PIPE)
            .stdout.decode("utf-8")
            .split("\n")[:-1]
        )
        log(f"Found topics: {topics}")

        # create a test ZTF topic for the current UTC date
        date = datetime.datetime.utcnow().strftime("%Y%m%d")
        topic_name = f"ztf_{date}_programid1_test"

        if topic_name in topics:
            # topic previously created? remove first
            cmd_remove_topic = [
                os.path.join(config["path"]["kafka"], "bin", "kafka-topics.sh"),
                "--zookeeper",
                config["kafka"]["zookeeper.test"],
                "--delete",
                "--topic",
                topic_name,
            ]
            # print(kafka_cmd)
            remove_topic = (
                subprocess.run(cmd_remove_topic, stdout=subprocess.PIPE)
                .stdout.decode("utf-8")
                .split("\n")[:-1]
            )
            log(f"{remove_topic}")
            log(f"Removed topic: {topic_name}")
            time.sleep(1)

        if topic_name not in topics:
            log(f"Creating topic {topic_name}")

            cmd_create_topic = [
                os.path.join(config["path"]["kafka"], "bin", "kafka-topics.sh"),
                "--create",
                "--bootstrap-server",
                config["kafka"]["bootstrap.test.servers"],
                "--replication-factor",
                "1",
                "--partitions",
                "1",
                "--topic",
                topic_name,
            ]
            with open(
                os.path.join(config["path"]["logs"], "create_topic.stdout"), "w"
            ) as stdout_create_topic:
                # p_create_topic = \
                subprocess.run(
                    cmd_create_topic,
                    stdout=stdout_create_topic,
                    stderr=subprocess.STDOUT,
                )

        log("Starting up Kafka Producer")

        # spin up Kafka producer
        producer = Producer(
            {"bootstrap.servers": config["kafka"]["bootstrap.test.servers"]}
        )

        # small number of alerts that come with kowalski
        path_alerts = pathlib.Path("/app/data/ztf_alerts/20200202/")
        # grab some more alerts from gs://ztf-fritz/sample-public-alerts
        try:
            log("Grabbing more alerts from gs://ztf-fritz/sample-public-alerts")
            r = requests.get("https://www.googleapis.com/storage/v1/b/ztf-fritz/o")
            aa = r.json()["items"]
            ids = [pathlib.Path(a["id"]).parent for a in aa if "avro" in a["id"]]
        except Exception as e:
            log(
                "Grabbing alerts from gs://ztf-fritz/sample-public-alerts failed, but it is ok"
            )
            log(f"{e}")
            ids = []
        subprocess.run(
            [
                "gsutil",
                "-m",
                "cp",
                "-n",
                "gs://ztf-fritz/sample-public-alerts/*.avro",
                "/app/data/ztf_alerts/20200202/",
            ]
        )
        log(f"Fetched {len(ids)} alerts from gs://ztf-fritz/sample-public-alerts")
        # push!
        for p in path_alerts.glob("*.avro"):
            with open(str(p), "rb") as data:
                # Trigger any available delivery report callbacks from previous produce() calls
                producer.poll(0)

                log(f"Pushing {p}")

                # Asynchronously produce a message, the delivery report callback
                # will be triggered from poll() above, or flush() below, when the message has
                # been successfully delivered or failed permanently.
                producer.produce(topic_name, data.read(), callback=delivery_report)

                # Wait for any outstanding messages to be delivered and delivery report
                # callbacks to be triggered.
        producer.flush()

        log("Starting up Ingester")

        # digest and ingest
        watchdog(obs_date=date, test=True)
        log("Digested and ingested: all done!")

        # shut down Kafka server and ZooKeeper
        time.sleep(20)

        log("Shutting down Kafka Server at localhost:9092")
        # start the Kafka server:
        cmd_kafka_server_stop = [
            os.path.join(config["path"]["kafka"], "bin", "kafka-server-stop.sh"),
            os.path.join(config["path"]["kafka"], "config", "server.properties"),
        ]

        with open(
            os.path.join(config["path"]["logs"], "kafka_server.stdout"), "w"
        ) as stdout_kafka_server:
            # p_kafka_server_stop = \
            subprocess.run(
                cmd_kafka_server_stop,
                stdout=stdout_kafka_server,
                stderr=subprocess.STDOUT,
            )

        log("Shutting down ZooKeeper at localhost:2181")
        cmd_zookeeper_stop = [
            os.path.join(config["path"]["kafka"], "bin", "zookeeper-server-stop.sh"),
            os.path.join(config["path"]["kafka"], "config", "zookeeper.properties"),
        ]

        with open(
            os.path.join(config["path"]["logs"], "zookeeper.stdout"), "w"
        ) as stdout_zookeeper:
            # p_zookeeper_stop = \
            subprocess.run(
                cmd_zookeeper_stop, stdout=stdout_zookeeper, stderr=subprocess.STDOUT
            )

        log("Checking the ZTF alert collection states")
        mongo = Mongo(
            host=config["database"]["host"],
            port=config["database"]["port"],
            replica_set=config["database"]["replica_set"],
            username=config["database"]["username"],
            password=config["database"]["password"],
            db=config["database"]["db"],
            verbose=True,
        )
        collection_alerts = config["database"]["collections"]["alerts_ztf"]
        collection_alerts_aux = config["database"]["collections"]["alerts_ztf_aux"]
        n_alerts = mongo.db[collection_alerts].count_documents({})
        assert n_alerts == 313
        n_alerts_aux = mongo.db[collection_alerts_aux].count_documents({})
        assert n_alerts_aux == 145

        if config["misc"]["broker"]:
            log("Checking that posting to SkyPortal succeeded")

            # check number of candidates that passed the first filter
            resp = requests.get(
                program.base_url + f"/api/candidates?groupIDs={program.group_id}",
                headers=program.headers,
                timeout=3,
            )

            assert resp.status_code == requests.codes.ok
            result = resp.json()
            assert result["status"] == "success"
            assert "data" in result
            assert "totalMatches" in result["data"]
            assert result["data"]["totalMatches"] == 88

            # check that the only candidate that passed the second filter (ZTF20aaelulu) got saved as Source
            resp = requests.get(
                program2.base_url + f"/api/sources?group_ids={program2.group_id}",
                headers=program2.headers,
                timeout=3,
            )

            assert resp.status_code == requests.codes.ok
            result = resp.json()
            assert result["status"] == "success"
            assert "data" in result
            assert "totalMatches" in result["data"]
            assert result["data"]["totalMatches"] == 1
            assert "sources" in result["data"]
            assert result["data"]["sources"][0]["id"] == "ZTF20aaelulu"
Example #9
syuzhet = pd.read_csv("lexicons/syuzhet.csv")
syuzhet.drop_duplicates('word', inplace=True)

# nrc2 differs from the original NRC lexicon: the 'trump', 'don' and 'jhon' keywords have been removed
nrc = pd.read_csv("lexicons/nrc2.csv",
                  header=0,
                  names=[
                      u'word', u'anger', u'anticipation', u'disgust', u'fear',
                      u'joy', u'negative', u'positive', u'sadness',
                      u'surprise', u'trust'
                  ])  # use modified lexicon
nrc.drop_duplicates('word', inplace=True)
nrc['value'] = nrc['positive'] - nrc['negative']

# Load data from Mongo
mongo = Mongo('facebook', 'comments')
docs = [doc for doc in mongo.collection.find()]
mongo.close()
mongo_ids = [doc.pop('_id', None)
             for doc in docs]  # exclude mongo generated ids
docs = d_to_df(docs)
docs['created_time'] = pd.to_datetime(docs['created_time'],
                                      format="%Y-%m-%dT%H:%M:%S+0000")
docs.set_index('created_time', inplace=True)
docs.drop_duplicates(['message', 'user.name', 'post_id'], inplace=True)
docs['n_sents'] = docs.message.apply(lambda x: len(sent_tokenize(x)))
docs['n_words'] = docs.message.apply(lambda x: len(tokenize.word_tokenize(x)))
docs = docs[docs['n_sents'] != 0].copy()

mongo = Mongo('facebook', 'posts')
posts = [doc for doc in mongo.collection.find()]
Example #10
                        help="batch size for ingestion")

    args = parser.parse_args()

    path = pathlib.Path(args.path)

    files = list(path.glob("Gaia*.csv"))

    catalog_name = "Gaia_EDR3"

    log("Connecting to DB")
    m = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=args.v,
    )
    log("Successfully connected")

    # Create indexes in the database:
    log("Creating indexes")
    # 2D position on the sphere, ID:
    m.db[catalog_name].create_index([("coordinates.radec_geojson", "2dsphere"),
                                     ("_id", 1)],
                                    background=True)
    m.db[catalog_name].create_index([("ra", 1), ("dec", 1), ("parallax", 1)],
                                    background=True)
    m.db[catalog_name].create_index(
Example #11
def get_ops():
    """
    Fetch and ingest ZTF ops data
    """
    # connect to MongoDB:
    print(f"{time_stamp()}: Connecting to DB.")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    print(f"{time_stamp()}: Successfully connected.")

    collection = "ZTF_ops"

    print(f"{time_stamp()}: Checking indexes.")
    mongo.db[collection].create_index(
        [("coordinates.radec_geojson", "2dsphere")], background=True)
    mongo.db[collection].create_index(
        [
            ("utc_start", pymongo.ASCENDING),
            ("utc_end", pymongo.ASCENDING),
            ("fileroot", pymongo.ASCENDING),
        ],
        background=True,
    )
    mongo.db[collection].create_index(
        [
            ("jd_start", pymongo.ASCENDING),
            ("jd_end", pymongo.ASCENDING),
            ("fileroot", pymongo.ASCENDING),
        ],
        background=True,
    )
    mongo.db[collection].create_index(
        [
            ("jd_start", pymongo.DESCENDING),
            ("pid", pymongo.ASCENDING),
            ("field", pymongo.ASCENDING),
        ],
        background=True,
    )

    # fetch full table
    print(f"{time_stamp()}: Fetching data.")
    url = config["ztf_ops"]["url"]
    r = requests.get(
        url,
        auth=(config["ztf_ops"]["username"], config["ztf_ops"]["password"]),
        verify=False,
    )
    if r.status_code == requests.codes.ok:
        with open(os.path.join(config["path"]["tmp"], "allexp.tbl"),
                  "wb") as f:
            f.write(r.content)
    else:
        raise Exception(f"{time_stamp()}: Failed to fetch allexp.tbl")

    latest = list(mongo.db[collection].find({},
                                            sort=[["$natural", -1]],
                                            limit=1))

    print(f"{time_stamp()}: Loading data.")
    df = pd.read_fwf(
        os.path.join(config["path"]["tmp"], "allexp.tbl"),
        comment="|",
        widths=[22, 4, 6, 4, 5, 8, 4, 9, 9, 7, 8, 29, 11, 25],
        header=None,
        names=[
            "utc_start",
            "sun_elevation",
            "exp",
            "filter",
            "type",
            "field",
            "pid",
            "ra",
            "dec",
            "slew",
            "wait",
            "fileroot",
            "programpi",
            "qcomment",
        ],
    )

    # drop comments:
    comments = df["utc_start"] == "UT_START"
    df = df.loc[~comments]

    for col in ["sun_elevation", "exp", "filter", "field", "pid"]:
        df[col] = df[col].apply(lambda x: int(x))
    for col in ["ra", "dec", "slew", "wait"]:
        df[col] = df[col].apply(lambda x: float(x))

    df["utc_start"] = df["utc_start"].apply(
        lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%f"))
    df["utc_end"] = df["utc_start"].add(
        df["exp"].apply(lambda x: datetime.timedelta(seconds=x)))

    df["jd_start"] = df["utc_start"].apply(lambda x: datetime_to_jd(x))
    df["jd_end"] = df["utc_end"].apply(lambda x: datetime_to_jd(x))

    # drop rows with jd_start <= the latest ingested jd_start
    if len(latest) > 0:
        new = df["jd_start"] > latest[0].get("jd_start", 0)

        if sum(new):
            print(f"{time_stamp()}: Found {sum(new)} new records.")
            df = df.loc[new]
        else:
            # no new data? take a nap...
            print(f"{time_stamp()}: No new data found.")
            # close connection to db
            mongo.client.close()
            print(f"{time_stamp()}: Disconnected from db.")
            return

    documents = df.to_dict("records")
    documents = [mongify(doc) for doc in documents]

    print(f"{time_stamp()}: Inserting {len(documents)} documents.")

    mongo.insert_many(collection=collection, documents=documents)

    # close connection to db
    mongo.client.close()
    print(f"{time_stamp()}: Disconnected from db.")
Example #12
def run(
    path: str,
    num_proc: int = multiprocessing.cpu_count(),
    batch_size: int = 2048,
    rm: bool = False,
    dry_run: bool = False,
):
    """Preprocess and Ingest ZTF matchfiles into Kowalski

    :param path: local path to matchfiles
    :param tag: matchfile release time tag
    :param num_proc: number of processes for parallel ingestion
    :param batch_size: batch size for light curve data ingestion
    :param rm: remove matchfiles after ingestion?
    :param dry_run: dry run?
    :return:
    """
    # connect to MongoDB:
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    log("Successfully connected to DB")

    collections = {
        "exposures": "PTF_exposures",
        "sources": "PTF_sources",
    }

    # create indices:
    log("Creating indices")
    if not dry_run:
        mongo.db[collections["exposures"]].create_index(
            [("expid", pymongo.ASCENDING)], background=True)
        mongo.db[collections["sources"]].create_index(
            [("coordinates.radec_geojson", "2dsphere"),
             ("_id", pymongo.ASCENDING)],
            background=True,
        )
        mongo.db[collections["sources"]].create_index(
            [
                ("field", pymongo.ASCENDING),
                ("ccd", pymongo.ASCENDING),
                ("quad", pymongo.ASCENDING),
            ],
            background=True,
        )
        mongo.db[collections["sources"]].create_index(
            [("nobs", pymongo.ASCENDING), ("_id", pymongo.ASCENDING)],
            background=True)

    files = [str(f) for f in pathlib.Path(path).glob("PTF_*.pytable")]

    log(f"# files to process: {len(files)}")

    input_list = [(f, collections, batch_size, rm, dry_run)
                  for f in sorted(files)]
    # for a more even job distribution:
    random.shuffle(input_list)

    with multiprocessing.Pool(processes=num_proc) as pool:
        for _ in tqdm(pool.imap(process_file, input_list), total=len(files)):
            pass
Example #13
def process_file(argument_list: Sequence):
    file_name, collections, batch_size, rm_file, dry_run = argument_list
    try:
        # connect to MongoDB:
        mongo = Mongo(
            host=config["database"]["host"],
            port=config["database"]["port"],
            replica_set=config["database"]["replica_set"],
            username=config["database"]["username"],
            password=config["database"]["password"],
            db=config["database"]["db"],
            verbose=0,
        )

        with tables.open_file(file_name, "r+") as f:
            for group in f.walk_groups():
                pass

            ff_basename = pathlib.Path(file_name).name
            # base id:
            _, field, filt, ccd, _, _ = ff_basename.split("_")
            field = int(field[1:])
            filt = int(filt[1:])
            readout_channel = int(ccd[1:])
            baseid = int(1e13 + field * 1e9 + readout_channel * 1e7 +
                         filt * 1e6)
            exposure_baseid = int(1e16 + field * 1e12 +
                                  readout_channel * 1e10 + filt * 1e9)

            def clean_up_document(group):
                """ Format passed in dicts for Mongo insertion """
                document = {}
                for k, v in group.items():
                    if k == "matchedSourceID":
                        document[k] = group[k]
                        continue
                    if k in sources_int_fields:
                        document[k] = [
                            int(group[k][key2]) for key2 in group[k].keys()
                        ]
                    else:
                        document[k] = [
                            float(group[k][key2]) for key2 in group[k].keys()
                        ]

                # document["ra"] = document["ra"][0]
                # document["dec"] = document["dec"][0]

                # generate unique _id:
                document["_id"] = baseid + document["matchedSourceID"]
                document["filter"] = filt
                document["field"] = field
                document["ccd"] = ccd

                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                _ra = np.median(document["ra"])
                _dec = np.median(document["dec"])
                _radec_str = [deg2hms(_ra), deg2dms(_dec)]
                document["coordinates"]["radec_str"] = _radec_str
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [_ra - 180.0, _dec]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }

                document["data"] = []
                for t, m, e, f, _ra, _dec in zip(
                        document["mjd"],
                        document["mag"],
                        document["magErr"],
                        document["ipacFlags"],
                        document["ra"],
                        document["dec"],
                ):
                    data_point = {
                        "mjd": t,
                        "mag": m,
                        "magerr": e,
                        "ipacflags": f,
                        "ra": _ra,
                        "dec": _dec,
                    }
                    # convert types for pymongo:
                    for k, v in data_point.items():
                        if k in sourcedata_int_fields:
                            data_point[k] = int(data_point[k])
                        else:
                            data_point[k] = float(data_point[k])
                            if k == "mjd":
                                data_point[k] = round(data_point[k], 5)
                            elif k not in ("ra", "dec"):
                                data_point[k] = round(data_point[k], 3)
                    document["data"].append(data_point)
                del (
                    document["mjd"],
                    document["mag"],
                    document["magErr"],
                    document["ipacFlags"],
                    document["ra"],
                    document["dec"],
                )
                document["data"].sort(key=lambda x: x["mjd"])

                return document

            exposures = pd.DataFrame.from_records(group.exposures[:])
            # prepare docs to ingest into db:
            docs_exposures = []
            for index, row in exposures.iterrows():
                try:
                    doc = row.to_dict()
                    # unique exposure id:
                    doc["_id"] = exposure_baseid + doc["expid"]
                    doc["matchfile"] = ff_basename
                    doc["filter"] = filt
                    doc["field"] = field
                    doc["ccd"] = ccd
                    docs_exposures.append(doc)
                except Exception as exception:
                    log(str(exception))

            # ingest exposures in one go:
            if not dry_run:
                mongo.insert_many(collection=collections["exposures"],
                                  documents=docs_exposures)

            sources = pd.DataFrame.from_records(
                group["sources"].read(),
                index="matchedSourceID",
                exclude=sources_fields_to_exclude,
            )
            sourcedatas = pd.DataFrame.from_records(
                group["sourcedata"].read(),
                index="matchedSourceID",
                exclude=sourcedata_to_exclude,
            )

            merged = sources.merge(sourcedatas,
                                   left_index=True,
                                   right_index=True)
            groups = merged.groupby("matchedSourceID")

            # light curves
            docs_sources = []
            batch_num = 1
            # fixme? skip transients
            for row, group in groups:
                try:
                    groupdict = group.to_dict()
                    groupdict["matchedSourceID"] = row
                    current_doc = clean_up_document(groupdict)
                    docs_sources.append(current_doc)
                except Exception as exception:
                    log(str(exception))

                # ingest in batches
                try:
                    if len(docs_sources) % batch_size == 0 and len(
                            docs_sources) != 0:
                        if not dry_run:
                            mongo.insert_many(
                                collection=collections["sources"],
                                documents=docs_sources,
                            )
                        # flush:
                        docs_sources = []
                        batch_num += 1
                except Exception as exception:
                    log(str(exception))

        # ingest remaining
        while len(docs_sources) > 0:
            try:
                # In case mongo crashed and disconnected, docs will accumulate in documents
                # keep on trying to insert them until successful
                if not dry_run:
                    mongo.insert_many(collection=collections["sources"],
                                      documents=docs_sources)
                    # flush:
                    docs_sources = []

            except Exception as e:
                traceback.print_exc()
                log(e)
                log("Failed, waiting 5 seconds to retry")
                time.sleep(5)

        mongo.client.close()

    except Exception as e:
        traceback.print_exc()
        log(e)
        # if there was an error, return without potentially deleting the file
        return

    try:
        if rm_file:
            os.remove(file_name)
    finally:
        pass
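The base-id arithmetic above packs the field, readout channel, and filter into fixed decimal slots, so that adding matchedSourceID yields a unique _id. A worked example with purely illustrative values:

# Illustrative values only; field, readout channel, and filter id are hypothetical.
field, readout_channel, filt = 742, 37, 2
baseid = int(1e13 + field * 1e9 + readout_channel * 1e7 + filt * 1e6)
assert baseid == 10_742_372_000_000
# a source's unique _id is then baseid + matchedSourceID; exposure ids follow the
# same scheme with larger offsets (1e16, 1e12, 1e10, 1e9)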
Example #14
import json
import pandas as pd
import tushare as ts

from bson.json_util import dumps
from datetime import date as Date
from datetime import timedelta as Period
from utils import Mongo as mongo_utils
from utils import Stock as stock_utils

client = mongo_utils.get_mongo_client()

orig_db = client.tickdata
db = client.stock

start_date = Date(2018, 1, 13)
end_date = Date(2016, 1, 1)

TICK_COLLECTION = "tickdata"

PAUSE = 10
RETRY = 10000

for col in orig_db.collection_names():
    if '_' in col:
        code = col.split('_')[0]
        date = col.split('_')[1]
        print "start processing, code: " + code + ", date: " + date
        date = Date(int(date[0:4]), int(date[4:6]), int(date[6:8]))
        if date < start_date and date > end_date:
            cursor = orig_db[col].find({})
Example #15
            post_id,
            fields=
            'reactions.type(LIKE).limit(0).summary(total_count).as(like),reactions.type(LOVE).limit(0).summary(total_count).as(love),reactions.type(WOW).limit(0).summary(total_count).as(wow),reactions.type(HAHA).limit(0).summary(total_count).as(haha),reactions.type(SAD).limit(0).summary(total_count).as(sad),reactions.type(ANGRY).limit(0).summary(total_count).as(angry),reactions.type(THANKFUL).limit(0).summary(total_count).as(thankful),reactions.type(NONE).limit(0).summary(total_count).as(total)'
        )
        post.update(reactions)
        post = process_post(post)
        posts_list.append(post)

    # Adding extra bits from the guardian
    for post in posts_list:
        extra = get_extra(post['article_url'])
        if extra:
            post.update(extra)

    # Inserting posts collected in Mongo
    mongo = Mongo('facebook', 'posts')
    for post in posts_list:
        mongo.process_item(post)
    mongo.close()
    del mongo

    # Collect Comments data
    for idx, post in enumerate(posts_list):
        post_id = post['post_id']
        print "Extracting %d comments for post %d ..." % (
            post['comment_count'], idx)
        comments = graph.get_all_connections(
            post_id,
            'comments',
            limit=100,
            fields='created_time,from,like_count,message,id,comment_count')
Example #16
import json
import tushare as ts

from datetime import date as Date
from datetime import timedelta as Period
from utils import Mongo as mongo_utils
from utils import Stock as stock_utils

client = mongo_utils.get_mongo_client()

db = client.stock

start_date = Date(2018,1,12)
end_date = Date(2017,12,1)

PAUSE = 10
RETRY = 10000


cursor = db.tickdata.find({'code': '002117', 'date': '2017-11-02'})

data = mongo_utils.convert_cursor_to_dataframe(cursor)

print(data)
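convert_cursor_to_dataframe presumably materializes the cursor into a pandas DataFrame; a minimal stand-in under that assumption:

import pandas as pd

def convert_cursor_to_dataframe(cursor):
    # Materialize the MongoDB cursor and let pandas infer the columns.
    return pd.DataFrame(list(cursor))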
Example #17
def process_file(args):
    file, collection, batch_size, rm, verbose = args
    # connect to MongoDB:
    if verbose:
        log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    if verbose:
        log("Successfully connected")

    collection = "Gaia_EDR3"

    if verbose:
        log(f"Processing {file}")

    for chunk_index, dataframe_chunk in enumerate(
            pd.read_csv(file, chunksize=batch_size)):

        if verbose:
            log(f"{file}: processing batch # {chunk_index + 1}")

        dataframe_chunk["_id"] = dataframe_chunk["source_id"].apply(
            lambda x: str(x))

        batch = dataframe_chunk.fillna("DROPMEPLEASE").to_dict(
            orient="records")

        # pop nulls - save space
        batch = [{
            key: value
            for key, value in document.items()
            if value not in ("DROPMEPLEASE", "NOT_AVAILABLE")
        } for document in batch]

        bad_document_indexes = []

        for document_index, document in enumerate(batch):
            try:
                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                # string format: H:M:S, D:M:S
                document["coordinates"]["radec_str"] = [
                    deg2hms(document["ra"]),
                    deg2dms(document["dec"]),
                ]
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [document["ra"] - 180.0, document["dec"]]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
            except Exception as e:
                if verbose:
                    log(str(e))
                bad_document_indexes.append(document_index)

        if len(bad_document_indexes) > 0:
            if verbose:
                log("Removing bad docs")
            for index in sorted(bad_document_indexes, reverse=True):
                del batch[index]

        # ingest batch
        mongo.insert_many(collection=collection, documents=batch)

    # disconnect from db:
    try:
        mongo.client.close()
    finally:
        if verbose:
            log("Successfully disconnected from db")

    # clean up:
    if rm:
        os.remove(file)
        if verbose:
            log(f"Successfully removed {file}")
Example #18
def process_file(argument_list: Sequence):
    file_name, collections, batch_size, rm_file, dry_run = argument_list
    try:
        # connect to MongoDB:
        mongo = Mongo(
            host=config["database"]["host"],
            port=config["database"]["port"],
            replica_set=config["database"]["replica_set"],
            username=config["database"]["username"],
            password=config["database"]["password"],
            db=config["database"]["db"],
            verbose=0,
        )

        with tables.open_file(file_name, "r+") as f:
            group = f.root.matches
            ff_basename = pathlib.Path(file_name).name
            # base id:
            _, field, filt, ccd, quadrant, _ = ff_basename.split("_")
            field = int(field)
            filt = filters[filt]
            ccd = int(ccd[1:])
            quadrant = int(quadrant[1:])
            readout_channel = ccd_quad_to_rc(ccd=ccd, quad=quadrant)
            baseid = int(1e13 + field * 1e9 + readout_channel * 1e7 +
                         filt * 1e6)
            exposure_baseid = int(1e16 + field * 1e12 +
                                  readout_channel * 1e10 + filt * 1e9)

            def clean_up_document(document):
                """ Format passed in dicts for Mongo insertion """
                # convert types for pymongo:
                for k, v in document.items():
                    if k != "data":
                        if k in sources_int_fields:
                            document[k] = int(document[k])
                        else:
                            document[k] = float(document[k])
                            if k not in ("ra", "dec"):
                                # this will save a lot of space:
                                document[k] = round(document[k], 3)

                # generate unique _id:
                document["_id"] = baseid + document["matchid"]
                document["filter"] = filt
                document["field"] = field
                document["ccd"] = ccd
                document["quad"] = quadrant
                document["rc"] = readout_channel

                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                _ra = document["ra"]
                _dec = document["dec"]
                _radec_str = [deg2hms(_ra), deg2dms(_dec)]
                document["coordinates"]["radec_str"] = _radec_str
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [_ra - 180.0, _dec]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
                document["data"].sort(key=lambda x: x["hjd"])
                for data_point in document["data"]:
                    # convert types for pymongo:
                    for k, v in data_point.items():
                        if k in sourcedata_int_fields:
                            data_point[k] = int(data_point[k])
                        else:
                            data_point[k] = float(data_point[k])
                            if k not in ("ra", "dec", "hjd"):
                                data_point[k] = round(data_point[k], 3)
                            elif k == "hjd":
                                data_point[k] = round(data_point[k], 5)
                    # generate unique exposure id's that match _id's in exposures collection
                    data_point[
                        "uexpid"] = exposure_baseid + data_point["expid"]

                return document

            exposures = pd.DataFrame.from_records(group.exposures[:])
            # prepare docs to ingest into db:
            docs_exposures = []
            for index, row in exposures.iterrows():
                try:
                    doc = row.to_dict()
                    # unique exposure id:
                    doc["_id"] = exposure_baseid + doc["expid"]
                    doc["matchfile"] = ff_basename
                    doc["filter"] = filt
                    doc["field"] = field
                    doc["ccd"] = ccd
                    doc["quad"] = quadrant
                    doc["rc"] = readout_channel
                    docs_exposures.append(doc)
                except Exception as exception:
                    log(str(exception))

            # ingest exposures in one go:
            if not dry_run:
                mongo.insert_many(collection=collections["exposures"],
                                  documents=docs_exposures)

            # light curves
            docs_sources = []
            batch_num = 1
            # fixme? skip transients
            # for source_type in ('source', 'transient'):
            for source_type in ("source", ):
                sources = pd.DataFrame.from_records(
                    group[f"{source_type}s"].read(),
                    index="matchid",
                    exclude=sources_fields_to_exclude,
                )
                # Load in percentiles separately to compute the IQR column
                # because Pandas DF from_records() only wants 2-D tables
                percentiles = group[f"{source_type}s"].col("percentiles")
                # Ignore float errors due to infinity values
                old_settings = np.seterr(all="ignore")
                iqr = np.round(percentiles[:, 8] - percentiles[:, 3], 3)
                np.seterr(**old_settings)
                sources["iqr"] = iqr

                sourcedatas = pd.DataFrame.from_records(
                    group[f"{source_type}data"][:],
                    index="matchid",
                    exclude=[
                        "ypos",
                        "xpos",
                        "mjd",
                        "psfflux",
                        "psffluxerr",
                        "magerrmodel",
                    ],
                )
                sourcedatas.rename(columns={
                    "ra": "ra_data",
                    "dec": "dec_data"
                },
                                   inplace=True)
                sourcedata_colnames = sourcedatas.columns.values
                # Join sources and their data
                merged = sources.merge(sourcedatas,
                                       left_index=True,
                                       right_index=True)
                prev_matchid = None
                current_doc = None
                for row in merged.itertuples():
                    matchid = row[0]
                    try:
                        # At a new source
                        if matchid != prev_matchid:
                            # Done with last source; save
                            if current_doc is not None:
                                current_doc = clean_up_document(current_doc)
                                docs_sources.append(current_doc)

                            # Set up new doc
                            doc = dict(row._asdict())
                            doc["matchid"] = doc["Index"]
                            doc.pop("Index")
                            # Coerce the source data info into its own nested array
                            first_data_row = {}
                            for col in sourcedata_colnames:
                                if col not in ["dec_data", "ra_data"]:
                                    first_data_row[col] = doc[col]
                                else:
                                    real_col = col.split("_data")[0]
                                    first_data_row[real_col] = doc[col]
                                doc.pop(col)
                            doc["data"] = [first_data_row]
                            current_doc = doc
                        # For continued source, just append new data row
                        else:
                            data_row = {}
                            data = dict(row._asdict())
                            for col in sourcedata_colnames:
                                if col not in ["dec_data", "ra_data"]:
                                    data_row[col] = data[col]
                                else:
                                    real_col = col.split("_data")[0]
                                    data_row[real_col] = data[col]

                            current_doc["data"].append(data_row)

                        prev_matchid = matchid

                    except Exception as exception:
                        log(str(exception))

                    # ingest in batches
                    try:
                        if (len(docs_sources) % batch_size == 0
                                and len(docs_sources) != 0):
                            if not dry_run:
                                mongo.insert_many(
                                    collection=collections["sources"],
                                    documents=docs_sources,
                                )
                            # flush:
                            docs_sources = []
                            batch_num += 1
                    except Exception as exception:
                        log(str(exception))

        # Clean up and append the last doc
        if current_doc is not None:
            current_doc = clean_up_document(current_doc)
            docs_sources.append(current_doc)

        # ingest remaining
        while len(docs_sources) > 0:
            try:
                # In case mongo crashed and disconnected, docs will accumulate in documents
                # keep on trying to insert them until successful
                if not dry_run:
                    mongo.insert_many(collection=collections["sources"],
                                      documents=docs_sources)
                    # flush:
                    docs_sources = []

            except Exception as e:
                traceback.print_exc()
                log(e)
                log("Failed, waiting 5 seconds to retry")
                time.sleep(5)

        mongo.client.close()

    except Exception as e:
        traceback.print_exc()
        log(e)
        # if there was an error, return without potentially deleting the file
        return

    try:
        if rm_file:
            os.remove(file_name)
    finally:
        pass
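These process_file workers are driven by a multiprocessing pool, as Example #12 shows for the PTF variant. A hypothetical usage sketch for the matchfile worker above (the path, collection names, and pool size are placeholders):

import multiprocessing
import pathlib

from tqdm import tqdm

collections = {"exposures": "ZTF_exposures", "sources": "ZTF_sources"}  # assumed names
files = [str(f) for f in pathlib.Path("./matchfiles").glob("*.pytable")]
input_list = [(f, collections, 2048, False, True) for f in files]  # rm=False, dry_run=True

with multiprocessing.Pool(processes=4) as pool:
    for _ in tqdm(pool.imap(process_file, input_list), total=len(input_list)):
        pass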