import csv
from os.path import splitext

import pandas as pd

# Local helper modules used throughout this module. The exact import paths are
# an assumption here; adjust them to wherever db, google, and util live in the
# project layout.
import db
import google
import util


def get_reformatted_wells(sheet, names):
    """Get replated Rapid data from Google sheet."""
    table, _ = splitext(sheet)
    csv_path = util.TEMP_DATA / f'{table}.csv'

    google.sheet_to_csv(sheet, csv_path)

    wells = pd.read_csv(csv_path, header=0, na_filter=False, names=names)
    wells['sample_id'] = wells['sample_id'].str.lower()
    # wells = wells.drop_duplicates('sample_id', keep=False)

    return wells.loc[wells['source_plate'] != '', :].copy()
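
# Illustrative call only: the sheet name and column list below are hypothetical
# placeholders, not the project's actual Rapid reformat schema.
#
#   wells = get_reformatted_wells(
#       'rapid_reformat_sheet.csv',
#       ['source_plate', 'source_well', 'sample_id', 'dest_plate', 'dest_well'])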


def ingest_corrales_data():
    """Process the Corrales data."""
    csv_path = util.TEMP_DATA / 'corrales.csv'
    cxn = db.connect()

    google.sheet_to_csv(util.CORRALES_SHEET, csv_path)

    corrales = pd.read_csv(csv_path)
    corrales.corrales_id = corrales.corrales_id.str.lower()
    corrales.image_file = corrales.image_file.apply(util.normalize_file_name)

    create_corrales_data_table(cxn, corrales)
    merge_into_images(cxn)


def ingest_priority_taxa_list():
    """Read the priority taxa list Google sheet."""
    csv_path = util.TEMP_DATA / 'priority_taxa.csv'
    cxn = db.connect()

    google.sheet_to_csv(util.PRIORITY_TAXA_SHEET, csv_path)

    taxa = pd.read_csv(
        csv_path,
        header=0,
        na_filter=False,
        names=['family', 'subclade', 'genus', 'position', 'priority'])

    create_priority_taxa_table(cxn, taxa)


def ingest_loci_data():
    """Read the Genbank loci Google sheet."""
    csv_path = util.TEMP_DATA / 'genbank_loci.csv'
    cxn = db.connect()

    google.sheet_to_csv(util.GENBANK_LOCI_SHEET, csv_path)

    loci = pd.read_csv(
        csv_path,
        header=0,
        names=['sci_name', 'its', 'atpb', 'matk', 'matr', 'rbcl'])

    loci.sci_name = loci.sci_name.str.split().str.join(' ')

    create_genbank_loci_table(cxn, loci)


def get_sample_sheet(google_sheet):
    """Get sample data from Google sheet."""
    csv_path = util.TEMP_DATA / f'{google_sheet}.csv'
    google.sheet_to_csv(google_sheet, csv_path)

    sample_sheet = pd.read_csv(
        csv_path,
        header=0,
        na_filter=False,
        names=[
            'sample_code', 'sample_id', 'i5_barcode_seq', 'i7_barcode_seq',
            'seq_file', 'seq_cycle'])

    # Keep only rows that actually have a sample code.
    return sample_sheet.loc[sample_sheet['sample_code'] != '', :]


def ingest_pilot_data():
    """Process the pilot data."""
    csv_path = util.TEMP_DATA / 'pilot.csv'
    cxn = db.connect()

    google.sheet_to_csv(util.PILOT_DATA_SHEET, csv_path)
    pilot = pd.read_csv(csv_path)

    # Create a fake path for the file name
    pilot['image_file'] = pilot['File'].apply(
        lambda x: f'{util.PILOT_DATA_DIR}/{x}')

    pilot = (pilot.drop(['File'], axis=1)
                  .rename(columns={'Identifier': 'pilot_id'}))

    pilot.pilot_id = pilot.pilot_id.str.lower().str.split().str.join(' ')

    create_pilot_data_table(cxn, pilot)
    merge_into_images(cxn)


def get_sequencing_sheet(google_sheet):
    """Get sequencing data from the Google sheet."""
    csv_path = util.TEMP_DATA / f'{google_sheet}.csv'
    google.sheet_to_csv(google_sheet, csv_path)

    seq_sheet = pd.read_csv(
        csv_path,
        header=0,
        na_filter=False,
        names=[
            'well', 'sample_id', 'local_plate', 'local_well', 'family',
            'genus', 'sci_name', 'total_dna', 'on_target_reads',
            'total_reads', 'on_target_percent', 'dedup_percent',
            'loci_assembled', 'hit_genus', 'hit_species', 'sample_genus',
            'sample_species', 'result', 'taxon_id_action'])

    return seq_sheet


def get_taxonomy(google_sheet):
    """Get the master taxonomy Google sheet."""
    csv_path = util.TEMP_DATA / f'{google_sheet}.csv'
    google.sheet_to_csv(google_sheet, csv_path)

    taxonomy = pd.read_csv(
        csv_path,
        header=0,
        names=[
            'column_a', 'family', 'sci_name', 'authority', 'synonyms',
            'sample_ids', 'provider_acronym', 'provider_id',
            'quality_notes'])

    # Drop rows without a scientific name, then normalize the name:
    # collapse whitespace and force "Genus species" capitalization.
    taxonomy = taxonomy[taxonomy.sci_name.notna()]
    taxonomy.sci_name = \
        taxonomy.sci_name.str.split().str.join(' ').str.capitalize()

    # The genus is the first word of the scientific name.
    taxonomy['genus'] = taxonomy.sci_name.str.split().str[0]

    return taxonomy
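
# Illustrative only: the scientific-name cleanup above behaves like this
# standalone pandas sketch (the species string is a made-up example).
#
#   >>> pd.Series(['  quercus   ALBA ']).str.split().str.join(' ').str.capitalize()
#   0    Quercus alba
#   dtype: object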


def ingest_samples():
    """
    Get the sample plates from the Google sheet.

    There is a fixed format to the plates:

                       Plate column 1  ...  Plate column 12
    plate_id:      UUID
    entry_date:    ISO_Date
    local_id:      Text
    rapid_plates:  Text
    notes:         Text
    results:       Text
    Plate row A    UUID?           ...  UUID?
    .              .               ...  .
    .              .               ...  .
    .              .               ...  .
    Plate row H    UUID?           ...  UUID?

    Method:
    1) We look for something in the plate_data column that is a UUID.
    2) We then take that row and the next n rows.
    """
    csv_path = util.TEMP_DATA / 'sample_plates.csv'

    google.sheet_to_csv(util.SAMPLE_PLATES_SHEET, csv_path)

    sample_wells = []
    with open(csv_path, newline='') as csv_file:
        reader = csv.reader(csv_file)
        for csv_row in reader:
            if util.is_uuid(csv_row[0]):
                header = build_headers(csv_row, reader)
                build_wells(header, reader, sample_wells)

    sample_wells = pd.DataFrame(sample_wells)
    write_to_db(sample_wells)
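

# Hypothetical driver for running every ingest step locally; the project's
# real entry point, and the required order of steps, may differ.
if __name__ == '__main__':
    ingest_corrales_data()
    ingest_pilot_data()
    ingest_priority_taxa_list()
    ingest_loci_data()
    ingest_samples()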