def clean_up_document(document):
    """Format passed in dicts for Mongo insertion"""
    # convert types for pymongo:
    for k, v in document.items():
        if k != "data":
            if k in sources_int_fields:
                document[k] = int(document[k])
            else:
                document[k] = float(document[k])
                if k not in ("ra", "dec"):
                    # this will save a lot of space:
                    document[k] = round(document[k], 3)

    # generate unique _id:
    document["_id"] = baseid + document["matchid"]
    document["filter"] = filt
    document["field"] = field
    document["ccd"] = ccd
    document["quad"] = quadrant
    document["rc"] = readout_channel

    # GeoJSON for 2D indexing
    document["coordinates"] = dict()
    _ra = document["ra"]
    _dec = document["dec"]
    _radec_str = [deg2hms(_ra), deg2dms(_dec)]
    document["coordinates"]["radec_str"] = _radec_str
    # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
    _radec_geojson = [_ra - 180.0, _dec]
    document["coordinates"]["radec_geojson"] = {
        "type": "Point",
        "coordinates": _radec_geojson,
    }

    document["data"].sort(key=lambda x: x["hjd"])
    for data_point in document["data"]:
        # convert types for pymongo:
        for k, v in data_point.items():
            if k in sourcedata_int_fields:
                data_point[k] = int(data_point[k])
            else:
                data_point[k] = float(data_point[k])
                if k not in ("ra", "dec", "hjd"):
                    data_point[k] = round(data_point[k], 3)
                elif k == "hjd":
                    data_point[k] = round(data_point[k], 5)
        # generate unique exposure id's that match _id's in exposures collection
        data_point["uexpid"] = exposure_baseid + data_point["expid"]

    return document

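# The sexagesimal helpers deg2hms / deg2dms used throughout are imported from the
# project's utilities. A minimal sketch of what they could look like, assuming the
# usual convention of HH:MM:SS.ssss for RA and [+/-]DD:MM:SS.sss for Dec; the real
# helpers may differ in formatting details and should be preferred.
def _deg2hms_sketch(ra: float) -> str:
    """Convert RA in degrees to an H:M:S string (hypothetical helper)."""
    hours = (ra % 360.0) / 15.0
    h = int(hours)
    m = int((hours - h) * 60.0)
    s = ((hours - h) * 60.0 - m) * 60.0
    return f"{h:02d}:{m:02d}:{s:07.4f}"


def _deg2dms_sketch(dec: float) -> str:
    """Convert Dec in degrees to a D:M:S string (hypothetical helper)."""
    sign = "-" if dec < 0 else ""
    dec = abs(dec)
    d = int(dec)
    m = int((dec - d) * 60.0)
    s = ((dec - d) * 60.0 - m) * 60.0
    return f"{sign}{d:02d}:{m:02d}:{s:06.3f}"
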
def process_file(file, collection, batch_size):
    # connect to MongoDB:
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    log("Successfully connected")

    collection = "VLASS_DR1"

    log(f"Processing {file}")

    names = [
        "Component_name", "RA", "DEC", "E_RA", "E_DEC",
        "Total_flux", "E_Total_flux", "Peak_flux", "E_Peak_flux",
        "Maj", "E_Maj", "Min", "E_Min",
        "Duplicate_flag", "Quality_flag",
    ]

    for chunk_index, dataframe_chunk in enumerate(
        pd.read_csv(file, chunksize=batch_size)
    ):
        log(f"{file}: processing batch # {chunk_index + 1}")

        dataframe_chunk = dataframe_chunk[names]
        dataframe_chunk = dataframe_chunk[dataframe_chunk["Duplicate_flag"] < 2]
        dataframe_chunk = dataframe_chunk[dataframe_chunk["Quality_flag"] == 0]

        batch = dataframe_chunk.to_dict(orient="records")

        bad_document_indexes = []

        for document_index, document in enumerate(batch):
            try:
                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                # string format: H:M:S, D:M:S
                document["coordinates"]["radec_str"] = [
                    deg2hms(document["RA"]),
                    deg2dms(document["DEC"]),
                ]
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [document["RA"] - 180.0, document["DEC"]]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
            except Exception as e:
                log(str(e))
                bad_document_indexes.append(document_index)

        if len(bad_document_indexes) > 0:
            log("Removing bad docs")
            for index in sorted(bad_document_indexes, reverse=True):
                del batch[index]

        # ingest batch
        mongo.insert_many(collection=collection, documents=batch)

    # disconnect from db:
    try:
        mongo.client.close()
    finally:
        log("Successfully disconnected from db")

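# The radec_geojson field only pays off if the target collection carries a 2dsphere
# index on it. A minimal sketch using pymongo directly; it assumes the Mongo wrapper
# used above exposes the underlying pymongo Database as `mongo.db`, which is an
# assumption about that wrapper, not something defined in this file.
import pymongo


def ensure_2dsphere_index(mongo, collection: str) -> None:
    # 2dsphere index on the GeoJSON point, used for cone searches / cross-matches
    mongo.db[collection].create_index(
        [("coordinates.radec_geojson", pymongo.GEOSPHERE)]
    )
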
def process_file(file, collection, batch_size):
    # connect to MongoDB:
    log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    log("Successfully connected")

    collection = "IGAPS_DR2"

    log(f"Processing {file}")

    names = [
        "name", "RA", "DEC", "gal_long", "gal_lat", "sourceID", "posErr",
        "mergedClass", "pStar", "pGalaxy", "pNoise",
        "i", "iErr", "iAB", "iEll", "iClass", "iDeblend", "iSaturated",
        "iVignetted", "iTrail", "iTruncated", "iBadPix", "iMJD", "iSeeing",
        "iDetectionID", "iDeltaRA", "iDeltaDEC",
        "ha", "haErr", "haAB", "haEll", "haClass", "haDeblend", "haSaturated",
        "haVignetted", "haTrail", "haTruncated", "haBadPix", "haMJD", "haSeeing",
        "haDetectionID", "haDeltaRA", "haDeltaDEC",
        "r_I", "rErr_I", "rAB_I", "rEll_I", "rClass_I", "rDeblend_I",
        "rSaturated_I", "rVignetted_I", "rTrail_I", "rTruncated_I", "rBadPix_I",
        "rMJD_I", "rSeeing_I", "rDetectionID_I",
        "r_U", "rErr_U", "rAB_U", "rEll_U", "rClass_U", "rDeblend_U",
        "rSaturated_U", "rVignetted_U", "rTrail_U", "rTruncated_U", "rBadPix_U",
        "rMJD_U", "rSeeing_U", "rDetectionID_U", "rDeltaRA_U", "rDeltaDEC_U",
        "g", "gErr", "gAB", "gEll", "gClass", "gDeblend", "gSaturated",
        "gVignetted", "gTrail", "gTruncated", "gBadPix", "gmask", "gMJD",
        "gSeeing", "gDetectionID", "gDeltaRA", "gDeltaDEC",
        "U_RGO", "UErr", "UEll", "UClass", "UDeblend", "USaturated",
        "UVignetted", "UTrail", "UTruncated", "UBadPix", "UMJD", "USeeing",
        "UDetectionID", "UDeltaRA", "UDeltaDEC",
        "brightNeighb", "deblend", "saturated", "nBands", "errBits",
        "nObs_I", "nObs_U", "fieldID_I", "fieldID_U", "fieldGrade_I",
        "fieldGrade_U", "emitter", "variable",
        "SourceID2", "i2", "i2Err", "i2Class", "i2Seeing", "i2MJD",
        "i2DeltaRA", "i2DeltaDEC", "i2DetectionID", "i2ErrBits",
        "ha2", "ha2Err", "ha2Class", "ha2Seeing", "ha2MJD", "ha2DeltaRA",
        "ha2DeltaDEC", "ha2DetectionID", "ha2ErrBits",
        "r2_I", "r2Err_I", "r2Class_I", "r2Seeing_I", "r2MJD_I", "r2DeltaRA_I",
        "r2DeltaDEC_I", "r2DetectionID_I", "r2ErrBits_I",
        "r2_U", "r2Err_U", "r2Class_U", "r2Seeing_U", "r2MJD_U", "r2DeltaRA_U",
        "r2DeltaDEC_U", "r2DetectionID_U", "r2ErrBits_U",
        "g2", "g2Err", "g2Class", "g2Seeing", "g2MJD", "g2DeltaRA",
        "g2DeltaDEC", "g2DetectionID", "g2ErrBits",
        "U_RGO2", "U2Err", "U2Class", "U2Seeing", "U2MJD", "U2DeltaRA",
        "U2DeltaDEC", "U2DetectionID", "U2ErrBits",
        "errBits2",
    ]

    with fits.open(file) as hdulist:
        nhdu = 1
        dataframe = pd.DataFrame(np.asarray(hdulist[nhdu].data), columns=names)

    for chunk_index, dataframe_chunk in dataframe.groupby(
        np.arange(len(dataframe)) // batch_size
    ):
        log(f"{file}: processing batch # {chunk_index + 1}")

        for col, dtype in dataframe_chunk.dtypes.items():
            if dtype == np.object:
                dataframe_chunk[col] = dataframe_chunk[col].apply(
                    lambda x: x.decode("utf-8")
                )

        batch = dataframe_chunk.to_dict(orient="records")
        batch = dataframe_chunk.fillna("DROPMEPLEASE").to_dict(orient="records")

        # pop nulls - save space
        batch = [
            {
                key: value
                for key, value in document.items()
                if value != "DROPMEPLEASE"
            }
            for document in batch
        ]

        bad_document_indexes = []

        for document_index, document in enumerate(batch):
            try:
                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                # string format: H:M:S, D:M:S
                document["coordinates"]["radec_str"] = [
                    deg2hms(document["RA"]),
                    deg2dms(document["DEC"]),
                ]
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [document["RA"] - 180.0, document["DEC"]]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
            except Exception as e:
                log(str(e))
                bad_document_indexes.append(document_index)

        if len(bad_document_indexes) > 0:
            log("Removing bad docs")
            for index in sorted(bad_document_indexes, reverse=True):
                del batch[index]

        # ingest batch
        mongo.insert_many(collection=collection, documents=batch)

    # disconnect from db:
    try:
        mongo.client.close()
    finally:
        log("Successfully disconnected from db")

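# The per-file ingesters above are embarrassingly parallel. A minimal sketch of how
# one might fan process_file out over many catalog files with the standard library;
# the glob pattern, collection name, and worker count are illustrative assumptions,
# not part of the original tooling.
import glob
import multiprocessing as mp


def ingest_all(
    pattern: str = "igaps-dr2-*.fits",
    collection: str = "IGAPS_DR2",
    batch_size: int = 2048,
    num_workers: int = 4,
) -> None:
    files = sorted(glob.glob(pattern))
    # each worker opens its own DB connection inside process_file
    with mp.Pool(processes=num_workers) as pool:
        pool.starmap(process_file, [(f, collection, batch_size) for f in files])
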
def process_file(_file, _collections, _batch_size=2048, _keep_all=False,
                 _rm_file=False, verbose=False, _dry_run=False):

    # connect to MongoDB:
    if verbose:
        print('Connecting to DB')
    _client, _db = connect_to_db()
    if verbose:
        print('Successfully connected')

    if verbose:
        print(f'processing {_file}')

    try:
        with tables.open_file(_file) as f:
            # print(f.root['/matches'].attrs)
            group = f.root.matches
            # print(f.root.matches.exposures._v_attrs)
            # print(f.root.matches.sources._v_attrs)
            # print(f.root.matches.sourcedata._v_attrs)

            ff_basename = os.path.basename(_file)

            # base id:
            _, field, filt, ccd, quad, _ = ff_basename.split('_')
            field = int(field)
            filt = filters[filt]
            ccd = int(ccd[1:])
            quad = int(quad[1:])

            rc = ccd_quad_2_rc(ccd=ccd, quad=quad)
            baseid = int(1e13 + field * 1e9 + rc * 1e7 + filt * 1e6)
            if verbose:
                # print(f'{_file}: {field} {filt} {ccd} {quad}')
                print(f'{_file}: baseid {baseid}')
            exp_baseid = int(1e16 + field * 1e12 + rc * 1e10 + filt * 1e9)
            # print(int(1e16), int(field*1e12), int(rc*1e10), int(filt*1e9), exp_baseid)

            # tic = time.time()
            exposures = pd.DataFrame.from_records(group.exposures[:])
            # exposures_colnames = exposures.columns.values
            # print(exposures_colnames)

            # prepare docs to ingest into db:
            docs_exposures = []
            for index, row in exposures.iterrows():
                try:
                    doc = row.to_dict()
                    # unique exposure id:
                    doc['_id'] = exp_baseid + doc['expid']
                    # print(exp_baseid, doc['expid'], doc['_id'])
                    doc['matchfile'] = ff_basename
                    doc['filter'] = filt
                    doc['field'] = field
                    doc['ccd'] = ccd
                    doc['quad'] = quad
                    doc['rc'] = rc
                    # pprint(doc)
                    docs_exposures.append(doc)
                except Exception as e_:
                    print(str(e_))

            # ingest exposures in one go:
            if not _dry_run:
                if verbose:
                    print(f'ingesting exposures for {_file}')
                insert_multiple_db_entries(_db, _collection=_collections['exposures'],
                                           _db_entries=docs_exposures)
                if verbose:
                    print(f'done ingesting exposures for {_file}')

            docs_sources = []
            batch_num = 1
            # fixme? skip transients
            # for source_type in ('source', 'transient'):
            for source_type in ('source', ):

                sources_colnames = group[f'{source_type}s'].colnames
                sources = np.array(group[f'{source_type}s'].read())
                # sources = group[f'{source_type}s'].read()

                # sourcedata = pd.DataFrame.from_records(group[f'{source_type}data'][:])
                # sourcedata_colnames = sourcedata.columns.values
                sourcedata_colnames = group[f'{source_type}data'].colnames
                # sourcedata = np.array(group[f'{source_type}data'].read())

                for source in sources:
                    try:
                        doc = dict(zip(sources_colnames, source))

                        # grab data first
                        sourcedata = np.array(
                            group[f'{source_type}data'].read_where(
                                f'matchid == {doc["matchid"]}'))
                        # print(sourcedata)
                        doc_data = [dict(zip(sourcedata_colnames, sd))
                                    for sd in sourcedata]

                        # skip sources that are only detected in the reference image:
                        if len(doc_data) == 0:
                            continue

                        # dump unwanted fields:
                        if not _keep_all:
                            # do not store all fields to save space
                            # sources_fields_to_keep = (
                            #     'astrometricrms', 'chisq', 'con', 'lineartrend',
                            #     'magrms', 'maxslope', 'meanmag', 'medianabsdev',
                            #     'medianmag', 'minmag', 'maxmag',
                            #     'nabovemeanbystd', 'nbelowmeanbystd',
                            #     'nconsecabovemeanbystd', 'nconsecbelowmeanbystd',
                            #     'nconsecfrommeanbystd', 'nmedianbufferrange',
                            #     'npairposslope', 'percentiles', 'skewness',
                            #     'smallkurtosis', 'stetsonj', 'stetsonk',
                            #     'vonneumannratio', 'weightedmagrms', 'weightedmeanmag',
                            #     'dec', 'matchid', 'nobs', 'ngoodobs',
                            #     'ra', 'refchi', 'refmag', 'refmagerr', 'refsharp', 'refsnr')
                            # sources_fields_to_keep = (
                            #     'meanmag', 'percentiles', 'vonneumannratio',
                            #     'dec', 'matchid', 'nobs',
                            #     'ra', 'refchi', 'refmag', 'refmagerr', 'refsharp', 'refsnr')
                            # refmagerr = 1.0857/refsnr
                            sources_fields_to_keep = ('meanmag', 'percentiles',
                                                      'vonneumannratio',
                                                      'dec', 'matchid', 'nobs',
                                                      'ra', 'refchi', 'refmag',
                                                      'refmagerr', 'refsharp')

                            doc_keys = list(doc.keys())
                            for kk in doc_keys:
                                if kk not in sources_fields_to_keep:
                                    doc.pop(kk)

                        # convert types for pymongo:
                        for k, v in doc.items():
                            # types.add(type(v))
                            if np.issubdtype(type(v), np.integer):
                                doc[k] = int(doc[k])
                            if np.issubdtype(type(v), np.inexact):
                                doc[k] = float(doc[k])
                                if k not in ('ra', 'dec'):
                                    doc[k] = round(doc[k], 3)
                            # convert numpy arrays into lists
                            if type(v) == np.ndarray:
                                doc[k] = doc[k].tolist()

                        # generate unique _id:
                        doc['_id'] = baseid + doc['matchid']
                        # from Frank Masci: compute ObjectID, same as serial key in ZTF Objects DB table in IRSA.
                        # oid = ((fieldid * 100000 + fid * 10000 + ccdid * 100 + qid * 10) * 10 ** 7) + int(matchid)

                        doc['iqr'] = doc['percentiles'][8] - doc['percentiles'][3]
                        doc['iqr'] = round(doc['iqr'], 3)
                        doc.pop('percentiles')

                        # doc['matchfile'] = ff_basename
                        doc['filter'] = filt
                        doc['field'] = field
                        doc['ccd'] = ccd
                        doc['quad'] = quad
                        doc['rc'] = rc

                        # doc['source_type'] = source_type

                        # GeoJSON for 2D indexing
                        doc['coordinates'] = {}
                        _ra = doc['ra']
                        _dec = doc['dec']
                        _radec = [_ra, _dec]
                        # string format: H:M:S, D:M:S
                        # tic = time.time()
                        _radec_str = [deg2hms(_ra), deg2dms(_dec)]
                        # print(time.time() - tic)
                        # print(_radec_str)
                        doc['coordinates']['radec_str'] = _radec_str
                        # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                        _radec_geojson = [_ra - 180.0, _dec]
                        doc['coordinates']['radec_geojson'] = {
                            'type': 'Point',
                            'coordinates': _radec_geojson
                        }
                        # radians and degrees:
                        # doc['coordinates']['radec_rad'] = [_ra * np.pi / 180.0, _dec * np.pi / 180.0]
                        # doc['coordinates']['radec_deg'] = [_ra, _dec]

                        # data
                        doc['data'] = doc_data
                        # print(doc['data'])

                        if not _keep_all:
                            # do not store all fields to save space
                            if len(doc_data) > 0:
                                # magerr = 1.0857/snr
                                sourcedata_fields_to_keep = ('catflags', 'chi', 'dec',
                                                             'expid', 'hjd',
                                                             'mag', 'magerr',
                                                             'programid', 'ra',
                                                             # 'relphotflags',
                                                             'snr', 'sharp')
                                doc_keys = list(doc_data[0].keys())
                                for ddi, ddp in enumerate(doc['data']):
                                    for kk in doc_keys:
                                        if kk not in sourcedata_fields_to_keep:
                                            doc['data'][ddi].pop(kk)

                        for dd in doc['data']:
                            # convert types for pymongo:
                            for k, v in dd.items():
                                # types.add(type(v))
                                if np.issubdtype(type(v), np.integer):
                                    dd[k] = int(dd[k])
                                if np.issubdtype(type(v), np.inexact):
                                    dd[k] = float(dd[k])
                                    if k not in ('ra', 'dec', 'hjd'):
                                        dd[k] = round(dd[k], 3)
                                    elif k == 'hjd':
                                        dd[k] = round(dd[k], 5)
                                # convert numpy arrays into lists
                                if type(v) == np.ndarray:
                                    dd[k] = dd[k].tolist()

                            # generate unique exposure id's that match _id's in exposures collection
                            dd['uexpid'] = exp_baseid + dd['expid']

                        # pprint(doc)
                        docs_sources.append(doc)

                    except Exception as e_:
                        print(str(e_))

                    # ingest in batches
                    try:
                        if len(docs_sources) % _batch_size == 0:
                            if verbose:
                                print(f'inserting batch #{batch_num} for {_file}')
                            if not _dry_run:
                                insert_multiple_db_entries(
                                    _db, _collection=_collections['sources'],
                                    _db_entries=docs_sources, _verbose=False)
                            # flush:
                            docs_sources = []
                            batch_num += 1
                    except Exception as e_:
                        print(str(e_))

        # ingest remaining
        while len(docs_sources) > 0:
            try:
                # In case mongo crashed and disconnected, docs will accumulate in documents
                # keep on trying to insert them until successful
                if verbose:
                    print(f'inserting batch #{batch_num} for {_file}')
                if not _dry_run:
                    insert_multiple_db_entries(
                        _db, _collection=_collections['sources'],
                        _db_entries=docs_sources, _verbose=False)
                # flush:
                docs_sources = []
            except Exception as e:
                traceback.print_exc()
                print(e)
                print('Failed, waiting 5 seconds to retry')
                time.sleep(5)

    except Exception as e:
        traceback.print_exc()
        print(e)

    # disconnect from db:
    try:
        if _rm_file:
            os.remove(_file)
            if verbose:
                print(f'Successfully removed {_file}')
        _client.close()
        if verbose:
            print('Successfully disconnected from db')
    finally:
        pass

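# ccd_quad_2_rc above maps a (ccd, quad) pair onto ZTF's 0-63 readout-channel index.
# A minimal sketch of that mapping, assuming the usual ZTF convention of
# rc = 4 * (ccd - 1) + (quad - 1) with ccd in 1..16 and quad in 1..4; the project's
# own helper should be preferred if its convention differs.
def ccd_quad_2_rc_sketch(ccd: int, quad: int) -> int:
    """Hypothetical readout-channel mapping for ZTF CCD/quadrant pairs."""
    if not (1 <= ccd <= 16 and 1 <= quad <= 4):
        raise ValueError("expected ccd in 1..16 and quad in 1..4")
    return 4 * (ccd - 1) + (quad - 1)
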
def process_file(args):
    file, collection, batch_size, rm, verbose = args

    # connect to MongoDB:
    if verbose:
        log("Connecting to DB")
    mongo = Mongo(
        host=config["database"]["host"],
        port=config["database"]["port"],
        replica_set=config["database"]["replica_set"],
        username=config["database"]["username"],
        password=config["database"]["password"],
        db=config["database"]["db"],
        verbose=0,
    )
    if verbose:
        log("Successfully connected")

    collection = "Gaia_EDR3"

    if verbose:
        log(f"Processing {file}")

    for chunk_index, dataframe_chunk in enumerate(
        pd.read_csv(file, chunksize=batch_size)
    ):
        if verbose:
            log(f"{file}: processing batch # {chunk_index + 1}")

        dataframe_chunk["_id"] = dataframe_chunk["source_id"].apply(lambda x: str(x))

        batch = dataframe_chunk.fillna("DROPMEPLEASE").to_dict(orient="records")

        # pop nulls - save space
        batch = [
            {
                key: value
                for key, value in document.items()
                if value not in ("DROPMEPLEASE", "NOT_AVAILABLE")
            }
            for document in batch
        ]

        bad_document_indexes = []

        for document_index, document in enumerate(batch):
            try:
                # GeoJSON for 2D indexing
                document["coordinates"] = dict()
                # string format: H:M:S, D:M:S
                document["coordinates"]["radec_str"] = [
                    deg2hms(document["ra"]),
                    deg2dms(document["dec"]),
                ]
                # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
                _radec_geojson = [document["ra"] - 180.0, document["dec"]]
                document["coordinates"]["radec_geojson"] = {
                    "type": "Point",
                    "coordinates": _radec_geojson,
                }
            except Exception as e:
                if verbose:
                    log(str(e))
                bad_document_indexes.append(document_index)

        if len(bad_document_indexes) > 0:
            if verbose:
                log("Removing bad docs")
            for index in sorted(bad_document_indexes, reverse=True):
                del batch[index]

        # ingest batch
        mongo.insert_many(collection=collection, documents=batch)

    # disconnect from db:
    try:
        mongo.client.close()
    finally:
        if verbose:
            log("Successfully disconnected from db")

    # clean up:
    if rm:
        os.remove(file)
        if verbose:
            log(f"Successfully removed {file}")

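# Once a catalog is ingested with radec_geojson and a 2dsphere index, positional
# cross-matches reduce to standard MongoDB geo queries. A minimal cone-search sketch
# with pymongo; the collection name and 2-arcsecond radius are illustrative
# assumptions, and `db` is a pymongo Database handle.
import math


def cone_search(db, ra: float, dec: float, radius_arcsec: float = 2.0,
                collection: str = "Gaia_EDR3"):
    # $centerSphere takes the radius in radians
    radius_rad = math.radians(radius_arcsec / 3600.0)
    return list(
        db[collection].find({
            "coordinates.radec_geojson": {
                "$geoWithin": {
                    # note the same lon = ra - 180 shift applied at ingestion time
                    "$centerSphere": [[ra - 180.0, dec], radius_rad]
                }
            }
        })
    )
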
def clean_up_document(group):
    """Format passed in dicts for Mongo insertion"""
    document = {}
    for k, v in group.items():
        if k == "matchedSourceID":
            document[k] = group[k]
            continue
        if k in sources_int_fields:
            document[k] = [int(group[k][key2]) for key2 in group[k].keys()]
        else:
            document[k] = [float(group[k][key2]) for key2 in group[k].keys()]

    # document["ra"] = document["ra"][0]
    # document["dec"] = document["dec"][0]

    # generate unique _id:
    document["_id"] = baseid + document["matchedSourceID"]
    document["filter"] = filt
    document["field"] = field
    document["ccd"] = ccd

    # GeoJSON for 2D indexing
    document["coordinates"] = dict()
    _ra = np.median(document["ra"])
    _dec = np.median(document["dec"])
    _radec_str = [deg2hms(_ra), deg2dms(_dec)]
    document["coordinates"]["radec_str"] = _radec_str
    # for GeoJSON, must be lon:[-180, 180], lat:[-90, 90] (i.e. in deg)
    _radec_geojson = [_ra - 180.0, _dec]
    document["coordinates"]["radec_geojson"] = {
        "type": "Point",
        "coordinates": _radec_geojson,
    }

    document["data"] = []
    for t, m, e, f, _ra, _dec in zip(
        document["mjd"],
        document["mag"],
        document["magErr"],
        document["ipacFlags"],
        document["ra"],
        document["dec"],
    ):
        data_point = {
            "mjd": t,
            "mag": m,
            "magerr": e,
            "ipacflags": f,
            "ra": _ra,
            "dec": _dec,
        }
        # convert types for pymongo:
        for k, v in data_point.items():
            if k in sourcedata_int_fields:
                data_point[k] = int(data_point[k])
            else:
                data_point[k] = float(data_point[k])
                if k == "mjd":
                    data_point[k] = round(data_point[k], 5)
                elif k not in ("ra", "dec"):
                    data_point[k] = round(data_point[k], 3)
        document["data"].append(data_point)

    del (
        document["mjd"],
        document["mag"],
        document["magErr"],
        document["ipacFlags"],
        document["ra"],
        document["dec"],
    )

    document["data"].sort(key=lambda x: x["mjd"])

    return document

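# clean_up_document above expects `group` to hold per-epoch columns keyed by row index
# (plus a scalar matchedSourceID), and it relies on module-level context: baseid, filt,
# field, ccd, sources_int_fields, sourcedata_int_fields, and the sexagesimal helpers.
# A minimal illustrative call under those assumptions; every value below is made up
# and only meant to show the expected shapes.
sources_int_fields = ("matchedSourceID", "ipacFlags")
sourcedata_int_fields = ("ipacflags",)
baseid, filt, field, ccd = int(1e13), 2, 745, 7

example_group = {
    "matchedSourceID": 12345,
    "mjd": {0: 58000.12345, 1: 58001.54321},
    "mag": {0: 18.234, 1: 18.301},
    "magErr": {0: 0.021, 1: 0.023},
    "ipacFlags": {0: 0, 1: 0},
    "ra": {0: 123.456, 1: 123.456},
    "dec": {0: -12.345, 1: -12.345},
}

# yields a document with _id, filter/field/ccd metadata, GeoJSON coordinates,
# and a time-sorted "data" array of photometry points
example_doc = clean_up_document(example_group)
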