def get_reformatted_wells(sheet, names):
    """Get replated Rapid data from Google sheet."""
    table, _ = splitext(sheet)
    csv_path = util.TEMP_DATA / f'{table}.csv'

    google.sheet_to_csv(sheet, csv_path)

    wells = pd.read_csv(csv_path, header=0, na_filter=False, names=names)
    wells['sample_id'] = wells['sample_id'].str.lower()
    # wells = wells.drop_duplicates('sample_id', keep=False)

    return wells.loc[wells['source_plate'] != '', :].copy()
def ingest_corrales_data():
    """Process the Corrales data."""
    csv_path = util.TEMP_DATA / 'corrales.csv'
    cxn = db.connect()

    google.sheet_to_csv(util.CORRALES_SHEET, csv_path)
    corrales = pd.read_csv(csv_path)
    corrales.corrales_id = corrales.corrales_id.str.lower()
    corrales.image_file = corrales.image_file.apply(util.normalize_file_name)

    create_corrales_data_table(cxn, corrales)

    merge_into_images(cxn)
示例#3
0
def ingest_priority_taxa_list():
    """Read the priority taxa list Google sheet."""
    csv_path = util.TEMP_DATA / 'priority_taxa.csv'
    cxn = db.connect()

    google.sheet_to_csv(util.PRIORITY_TAXA_SHEET, csv_path)

    taxa = pd.read_csv(
        csv_path,
        header=0,
        na_filter=False,
        names=['family', 'subclade', 'genus', 'position', 'priority'])

    create_priority_taxa_table(cxn, taxa)
示例#4
0
def ingest_loci_data():
    """Read the Genbank loci Google sheet."""
    csv_path = util.TEMP_DATA / 'genbank_loci.csv'
    cxn = db.connect()

    google.sheet_to_csv(util.GENBANK_LOCI_SHEET, csv_path)

    loci = pd.read_csv(
        csv_path,
        header=0,
        names=['sci_name', 'its', 'atpb', 'matk', 'matr', 'rbcl'])

    loci.sci_name = loci.sci_name.str.split().str.join(' ')

    create_genbank_loci_table(cxn, loci)
示例#5
0
def get_sample_sheet(google_sheet):
    """Get sample data from Google sheet."""
    csv_path = util.TEMP_DATA / f'{google_sheet}.csv'

    google.sheet_to_csv(google_sheet, csv_path)

    sample_sheet = pd.read_csv(csv_path,
                               header=0,
                               na_filter=False,
                               names=[
                                   'sample_code', 'sample_id',
                                   'i5_barcode_seq', 'i7_barcode_seq',
                                   'seq_file', 'seq_cycle'
                               ])

    return sample_sheet.loc[sample_sheet['sample_code'] != '', :]
示例#6
0
def ingest_pilot_data():
    """Process the pilot data."""
    csv_path = util.TEMP_DATA / 'pilot.csv'
    cxn = db.connect()

    google.sheet_to_csv(util.PILOT_DATA_SHEET, csv_path)
    pilot = pd.read_csv(csv_path)

    # Create a fake path for the file name
    pilot['image_file'] = pilot['File'].apply(
        lambda x: f'{util.PILOT_DATA_DIR}/{x}')

    pilot = (pilot.drop(['File'],
                        axis=1).rename(columns={'Identifier': 'pilot_id'}))
    pilot.pilot_id = pilot.pilot_id.str.lower().str.split().str.join(' ')

    create_pilot_data_table(cxn, pilot)

    merge_into_images(cxn)
示例#7
0
def get_sequencing_sheet(google_sheet):
    """Get replated Rapid data from Google sheet."""
    csv_path = util.TEMP_DATA / f'{google_sheet}.csv'

    google.sheet_to_csv(google_sheet, csv_path)

    seq_sheet = pd.read_csv(
        csv_path,
        header=0,
        na_filter=False,
        names=[
            'well', 'sample_id', 'local_plate', 'local_well', 'family',
            'genus', 'sci_name', 'total_dna', 'on_target_reads', 'total_reads',
            'on_target_percent', 'dedup_percent', 'loci_assembled',
            'hit_genus', 'hit_species', 'sample_genus', 'sample_species',
            'result', 'taxon_id_action'
        ])

    return seq_sheet
示例#8
0
def get_taxonomy(google_sheet):
    """Get the master taxonomy google sheet."""
    csv_path = util.TEMP_DATA / f'{google_sheet}.csv'

    google.sheet_to_csv(google_sheet, csv_path)

    taxonomy = pd.read_csv(csv_path,
                           header=0,
                           names=[
                               'column_a', 'family', 'sci_name', 'authority',
                               'synonyms', 'sample_ids', 'provider_acronym',
                               'provider_id', 'quality_notes'
                           ])
    taxonomy = taxonomy[taxonomy.sci_name.notna()]

    taxonomy.sci_name = \
        taxonomy.sci_name.str.split().str.join(' ').str.capitalize()
    taxonomy['genus'] = taxonomy.sci_name.str.split().str[0]

    return taxonomy
def ingest_samples():
    """
    Get the Sample plates from the Google sheet.

    There is a fixed format to the plates:
                           Plate column 1  ...     Plate column 12
    plate_id:    UUID
    entry_date:  ISO_Date
    local_id:    Text
    rapid_plates:Text
    notes:       Text
    results:     Text
    Plate row A                UUID?          ...     UUID?
        .                        .            ...       .
        .                        .            ...       .
        .                        .            ...       .
    Plate row H                UUID?          ...     UUID?

    Method:
    1) We look for something in the plate_data column that is a UUID.
    2) We then take that row and the next n rows.
    """
    csv_path = util.TEMP_DATA / 'sample_plates.csv'
    google.sheet_to_csv(util.SAMPLE_PLATES_SHEET, csv_path)

    sample_wells = []

    with open(csv_path, newline='') as csv_file:
        reader = csv.reader(csv_file)
        for csv_row in reader:
            if util.is_uuid(csv_row[0]):
                header = build_headers(csv_row, reader)
                build_wells(header, reader, sample_wells)

    sample_wells = pd.DataFrame(sample_wells)

    write_to_db(sample_wells)