import os
import argparse

import ftputil
import pandas as pd

import metadata_validator
import populate_metadata
import ming_proteosafe_library

# Note: find_dataset_metadata and import_identification are assumed to be
# defined elsewhere in this module; they are not shown in this excerpt.


def process_metadata_import(dataset_accession, dryrun=False, massive_host=None):
    dataset_metadatum = find_dataset_metadata(dataset_accession, useftp=True, massive_host=massive_host)

    if dataset_metadatum is None:
        print("Not Importing %s, no metadata" % dataset_accession)
        return -2, -2

    print("Importing %s" % dataset_accession, dataset_metadatum)

    # Save the metadata file locally
    local_metadata_path = os.path.join("tempuploads", dataset_accession + ".tsv")
    try:
        massive_host.download(dataset_metadatum["path"], local_metadata_path)
    except Exception:
        print("CANT DOWNLOAD", dataset_metadatum["path"])
        raise

    metadata_validator.rewrite_metadata(local_metadata_path)

    # Validate the raw metadata file
    pass_validation, failures, errors_list, valid_rows, total_rows_count = metadata_validator.perform_validation(local_metadata_path)

    # Reject outright if required columns are missing
    if len([error for error in errors_list if error["error_string"].find("Missing column") != -1]) > 0:
        print("Missing Columns, Rejected")
        return -1, -1

    # Filter out rows that do not match the dataset accession
    local_filtered_metadata_path = os.path.join("tempuploads", "filtered_" + dataset_accession + ".tsv")
    metadata_df = pd.DataFrame(valid_rows)
    try:
        metadata_df = metadata_df[metadata_df['MassiveID'] == dataset_accession]
    except KeyError:
        # No MassiveID column to filter on; keep all valid rows
        metadata_df = pd.DataFrame(valid_rows)
    metadata_df.to_csv(local_filtered_metadata_path, sep="\t", index=False)

    # Re-validate the filtered file before importing
    try:
        pass_validation, failures, errors_list, valid_rows, total_rows_count = metadata_validator.perform_validation(local_filtered_metadata_path)
    except Exception:
        pass_validation = False

    added_files_count = 0
    if pass_validation:
        print("Importing Data")
        if not dryrun:
            added_files_count = populate_metadata.populate_dataset_metadata(local_filtered_metadata_path, massive_host=massive_host)
    else:
        print("Filtered File is not valid")

    return len(metadata_df), added_files_count
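# A minimal usage sketch for calling process_metadata_import directly. The
# host name and anonymous login are taken from main() below; the accession
# is a hypothetical placeholder:
#
#   with ftputil.FTPHost("massive.ucsd.edu", "anonymous", "") as host:
#       valid_entries, files_added = process_metadata_import(
#           "MSV000012345", dryrun=True, massive_host=host)
#
# With dryrun=True the metadata is downloaded and validated but nothing is
# written to the database.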
def main():
    parser = argparse.ArgumentParser(description='Importing Database')
    parser.add_argument('--importmetadata', default=None, help='Imports metadata; options are all, dataset, file')
    parser.add_argument('--metadatafile', help='Metadata filename to import')
    parser.add_argument('--metadataaccession', help='Metadata accession to import')
    parser.add_argument('--importidentifications', default=None, help='Imports identifications from a task file')
    parser.add_argument('--identifications_output', help='Output path for identifications')
    args = parser.parse_args()

    massive_host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")

    # Importing metadata first
    if args.importmetadata == "all":
        summary_list = []
        all_datasets = ming_proteosafe_library.get_all_datasets()
        for dataset in all_datasets:
            if "GNPS" not in dataset["title"].upper():
                continue

            # Make sure the FTP connection is still alive; reconnect if not
            try:
                massive_host.listdir("/")
            except Exception as e:
                print("MassIVE connection broken, reconnecting", e)
                massive_host = ftputil.FTPHost("massive.ucsd.edu", "anonymous", "")

            try:
                print("Importing, ", dataset["dataset"])
                total_valid_metadata_entries, files_added = process_metadata_import(dataset["dataset"], dryrun=False, massive_host=massive_host)
            except KeyboardInterrupt:
                raise
            except Exception:
                total_valid_metadata_entries = -1
                files_added = -1

            summary_dict = {}
            summary_dict["total_valid_metadata_entries"] = total_valid_metadata_entries
            summary_dict["files_added"] = files_added
            summary_dict["accession"] = dataset["dataset"]
            summary_list.append(summary_dict)

            # Write a running summary after every dataset so partial progress survives a crash
            try:
                pd.DataFrame(summary_list).to_csv("/app/database/add_metadata_summary.tsv", sep="\t", index=False)
            except Exception:
                continue
    elif args.importmetadata == "dataset":
        total_valid_metadata_entries, files_added = process_metadata_import(args.metadataaccession, massive_host=massive_host)
        print(total_valid_metadata_entries, files_added)
    elif args.importmetadata == "file":
        populate_metadata.populate_dataset_metadata(args.metadatafile, massive_host=massive_host)

    # Import library identifications
    if args.importidentifications is not None:
        import_identification(args.importidentifications, args.identifications_output, force=True)
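# Example invocations, derived from the argparse flags above. The script name
# and the accession are placeholders, not confirmed by the original source:
#
#   python import_database.py --importmetadata all
#   python import_database.py --importmetadata dataset --metadataaccession MSV000012345
#   python import_database.py --importmetadata file --metadatafile local_metadata.tsv

# Entry-point guard (assumed; not shown in the original excerpt):
if __name__ == "__main__":
    main()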