Exemplo n.º 1
0
def main():
    """
    Bootstrap the update process by wrapping the initialization and termination
    of logging and database access.

    Each task receives the open database session. Errors raised by tasks are
    caught here and logged, and the remaining tasks are skipped.
    """
    initialize_logging('../logs/update.log')
    logger = logging.getLogger()
    engine, session = database.initialize('sqlite:///../data/isrid-master.db')

    for task in [augment_weather_instances]:
        try:
            label = task.__name__.replace('_', ' ')
            logger.info('Starting task: {}'.format(label))
            task(session)
        except KeyboardInterrupt:
            print()
            logger.info('Terminating update ... ')
            break
        except Exception as error:
            logger.error('{}: {}'.format(type(error).__name__, error))
            break

    logging.shutdown()  # Flush files
    database.terminate(engine, session)
Exemplo n.º 2
0
def execute():
    """
    Merge every Excel workbook under `../data/` into the master database.

    For each worksheet, a merge procedure is looked up in the `Registry`
    (first by filename and sheet title, then by filename alone) and fed each
    labeled row together with any column mapping from `mappings.yaml`.
    """
    warnings.filterwarnings("ignore")
    initialize_logging("../logs/merge.log", "a+")

    logger = logging.getLogger()
    engine, session = database.initialize("sqlite:///../data/isrid-master.db")

    with open("../data/mappings.yaml") as mappings_file:
        # `yaml.load` without an explicit Loader is deprecated and can execute
        # arbitrary tags; the mapping file only needs plain YAML types.
        mappings = yaml.safe_load(mappings_file)

    for filename in os.listdir("../data/"):
        if filename.endswith(".xlsx"):
            for title, rows in read_excel(os.path.join("../data/", filename)):
                procedure = Registry.retrieve(filename, title)
                procedure = procedure or Registry.retrieve(filename)
                mapping = mappings.get(filename, {}).get(title, {})

                if procedure:
                    message = "Merging '{}' from '{}' ... "
                    logger.info(message.format(title, filename))
                    labels = list(next(rows))

                    # Some sheets repeat the "Equipment4" header; rename the
                    # last duplicate so `dict(zip(...))` does not drop a column.
                    if labels.count("Equipment4") > 1:
                        index = labels[::-1].index("Equipment4")
                        labels[-index - 1] = "Equipment5"

                    for index, row in enumerate(rows):
                        labeled_row = dict(zip(labels, row))
                        for model in procedure(index, labeled_row, mapping):
                            session.add(model)

                    session.commit()

    logging.shutdown()
    database.terminate(engine, session)
Exemplo n.º 3
0
def read_time_data(url):
    """
    Read the time of day each incident occurred at.

    Arguments:
        url: A string representing the path to the database.

    Returns:
        A `pandas` dataframe with two columns: `time` and `hour`. `time`
        contains Python `datetime.time` objects with times at midnight filtered
        out (most of these indicate a date was available, but not time). `hour`
        is `time` in hours (a float between 0 and 24, exclusive).

        `time` is derived from `Incident.datetime`.
    """
    engine, session = database.initialize(url)
    df = tabulate(session.query(Incident.datetime))
    database.terminate(engine, session)

    # Use `dt` (not `datetime`) as the loop variable so it does not shadow the
    # `datetime` module, which is still needed below for `datetime.time(0)`.
    df = df.assign(time=[dt.time() for dt in df.datetime])
    df = df[df.time != datetime.time(0)]
    df = df.assign(hour=[time.hour + time.minute/60 + time.second/3600
                         for time in df.time])

    return df
Exemplo n.º 4
0
def read_data(url):
    """
    Read incident duration, subject survival, and group category rows into a
    `pandas` dataframe.
    """
    engine, session = database.initialize(url)

    columns = (Incident.total_hours, Subject.survived, Group.category)
    frame = tabulate(session.query(*columns).join(Group, Subject))

    database.terminate(engine, session)

    return frame
Exemplo n.º 5
0
def read_time_data(url):
    """
    Read the time of day each incident occurred at.

    Returns a dataframe with a `time` column (Python `datetime.time` values
    derived from `Incident.datetime`, midnight rows dropped — midnight usually
    means only a date was recorded) and an `hour` column (`time` expressed as
    a float number of hours).
    """
    engine, session = database.initialize(url)
    df = tabulate(session.query(Incident.datetime))
    database.terminate(engine, session)

    # NOTE(review): the comprehension variable shadows the `datetime` module;
    # it works because comprehension scope is local, but it is fragile.
    df = df.assign(time=[datetime.time() for datetime in df.datetime])
    df = df[df.time != datetime.time(0)]
    df = df.assign(hour=[time.hour + time.minute/60 + time.second/3600
                         for time in df.time])

    return df
Exemplo n.º 6
0
def read_simple_data(url, exclude_singles=False, exclude_groups=False):
    """
    Read incident duration, survival, and category data. A useful shorthand.

    Arguments:
        url: A string representing the database URL to connect to.
        exclude_singles: A boolean indicating whether the query should exclude
                         subjects from groups with exactly one member.
        exclude_groups: A boolean indicating whether the query should exclude
                        subjects from groups with more than one members.

    Returns:
        A pandas dataframe containing the lost person data. The columns include
        `total_hours`, `survived`, `category`, `days` (the incident duration in
        days, as taken from `total_hours`), and `doa` (a boolean that is `True`
        is the subject did not survive). Cases with a negative timedelta
        `Incident.total_hours` are filtered out.

    Warning:
        If `exclude_singles` is `True` or `exclude_groups` is `True`, the
        function also needs to query the size of each `Group`, which may take
        a while (perhaps a minute).
    """
    engine, session = database.initialize(url)

    columns = Incident.total_hours, Subject.survived, Group.category, Group.id
    query = session.query(*columns).join(Group, Subject)
    df = tabulate(query)

    database.terminate(engine, session)

    if exclude_singles or exclude_groups:
        df['size'] = [Group.query.get(int(id_)).size for id_ in df.id]  # Hack
    if exclude_singles:
        df = df[df['size'] > 1]
    if exclude_groups:
        df = df[df['size'] == 1]

    # Positional `axis` for `DataFrame.drop` was removed in pandas 2.0; the
    # `columns=` keyword is equivalent and works on older versions too.
    if 'size' in df:
        df.drop(columns='size', inplace=True)
    df.drop(columns='id', inplace=True)

    df = df.assign(days=[total_hours.total_seconds()/3600/24
                         for total_hours in df.total_hours],
                   doa=[not survived for survived in df.survived])
    df = df[0 <= df.days]

    return df
Exemplo n.º 7
0
def read_data(url, *columns, not_null=True):
    """
    Read the given model columns into a typed `pandas` dataframe.

    Arguments:
        url: A string representing the database URL to connect to.
        columns: Model columns to fetch (joined through `Group` and
                 `Incident`).
        not_null: When `True` (the default), rows where any requested column
                  is NULL are filtered out.

    Returns:
        A `pandas` dataframe with one column per requested model column.
        `datetime.timedelta` columns are converted to float hours.
    """
    engine, session = database.initialize(url)

    query = session.query(*columns).join(Group, Incident)
    if not_null:
        # Bug fix: this filter used to be applied unconditionally, making the
        # `not_null` parameter a no-op. Note SQLAlchemy requires `!= None`
        # (not `is not None`) to emit `IS NOT NULL`.
        query = query.filter(*map(lambda column: column != None, columns))

    data = pd.DataFrame()

    for column in columns:
        name, datatype = str(column).split(".")[-1], column.type.python_type
        values = (value for value, *empty in query.from_self(column))

        if datatype == datetime.timedelta:
            # Store durations as float hours so numpy can hold them natively.
            datatype = float
            values = map(lambda value: value.total_seconds() / 3600, values)

        data[name] = np.fromiter(values, np.dtype(datatype))

    # Bug fix: terminate only after the lazy queries above have actually been
    # executed; the session was previously closed before iteration.
    database.terminate(engine, session)

    return data
Exemplo n.º 8
0
def loop():
    """
    Run a read-eval-print loop over expressions typed by the user.
    """
    engine, session = database.initialize('sqlite:///../data/isrid-master.db')
    print('Shell initialized at: {}'.format(datetime.datetime.now()))

    cmd = 1  # You can change your prompt to include the command number
    while True:
        try:
            line = input('[!] ').strip()
            if not line:
                continue
            print(' =>', eval(line))
        except (KeyboardInterrupt, EOFError):
            print()
            break
        except Exception as error:
            print(' => {}: {}'.format(type(error).__name__, error))
        finally:
            cmd += 1

    database.terminate(engine, session)  # Cleanly shut down SQLAlchemy
Exemplo n.º 9
0
def execute():
    """
    Run each registered update task against the master database, logging task
    progress and stopping on the first error or interrupt.
    """
    initialize_logging('../logs/update.log')
    logger = logging.getLogger()
    engine, session = database.initialize('sqlite:///../data/isrid-master.db')

    tasks = [augment_weather_instances]

    for task in tasks:
        try:
            logger.info('Starting task: {}'.format(
                task.__name__.replace('_', ' ')))
            task(session)
        except KeyboardInterrupt:
            print()
            logger.info('Terminating update ... ')
            break
        except Exception as error:
            logger.error('{}: {}'.format(type(error).__name__, error))
            break

    logging.shutdown()
    database.terminate(engine, session)
Exemplo n.º 10
0
 def tearDown(self):
     """Dispose of the engine and session this test case opened in setUp."""
     database.terminate(self.engine, self.session)
Exemplo n.º 11
0
#!/usr/bin/env python3

import matplotlib.pyplot as plt
import numpy as np
import Orange
from pomegranate import *

import database
from database.models import Subject, Group, Incident, Weather
from database.processing import survival_rate, tabulate, export_as_orange


engine, session = database.initialize('sqlite:///../data/isrid-master.db')

# One row per subject: survival outcome, group size, and the average
# temperature recorded for the incident's weather conditions.
query = session.query(Subject.survived, Group.size, Weather.avg_temp)
query = query.join(Group, Incident, Weather)
df = tabulate(query, [True, True, True])

database.terminate(engine, session)


# Overall survival rate across the tabulated subjects.
print(sum(df.survived)/len(df))
Exemplo n.º 12
0
def terminate_session(error):
    """Request-teardown handler: close the database session if one was opened."""
    if not database_initialized():
        return
    app.logger.info('Database terminated')
    database.terminate(g.engine, g.session)
Exemplo n.º 13
0
def execute():
    """
    Plot Kaplan-Meier survival curves, split into group and single-subject
    cases, for the most common lost-person categories, and save the figures.
    """
    matplotlib.rc("font", size=20)

    engine, session = database.initialize("sqlite:///../data/isrid-master.db")

    # Query with Group.size may take awhile, at least for Charles
    # Not sure why
    query = session.query(Incident.total_hours, Subject.survived, Group.category, Group.size).join(Group, Subject)
    print("Tabulating query... may take awhile for unknown reasons.")
    df = tabulate(query)
    print("Done tabulating.")
    print(df.describe())
    database.terminate(engine, session)

    df = df.assign(
        days=[total_hours.total_seconds() / 3600 / 24 for total_hours in df.total_hours],
        doa=[not survived for survived in df.survived],
    )
    df = df[0 <= df.days]

    rows, columns = 2, 2
    grid, axes = plt.subplots(rows, columns, figsize=(15, 10))

    categories = Counter(df.category)
    plot = 0
    kmfs = []
    options = {"show_censors": True, "censor_styles": {"marker": "|", "ms": 6}, "censor_ci_force_lines": False}

    for category, count in categories.most_common()[: rows * columns]:
        print("Category:", category)
        ax = axes[plot // columns, plot % columns]
        df_ = df[df.category == category]
        N, Ndoa = len(df_), sum(df_.doa)
        Srate = 100 * (1 - Ndoa / N)
        # Bug fix: `df_.size` resolves to the DataFrame `size` property (the
        # total element count, a scalar), not the queried `size` column, so
        # `df_[df_.size > 1]` did not split groups from singles. Subscript
        # the column explicitly instead.
        grp = df_[df_["size"] > 1]
        sng = df_[df_["size"] == 1]
        kmf = KaplanMeierFitter()
        kmf.fit(grp.days, event_observed=grp.doa, label=category + " Groups")
        kmf.plot(ax=ax, **options)
        kmf.fit(sng.days, event_observed=sng.doa, label=category + " Singles")
        kmf.plot(ax=ax, **options)
        kmfs.append(kmf)

        ax.set_xlim(0, min(30, 1.05 * ax.get_xlim()[1]))
        ax.set_ylim(0, 1)
        ax.set_title("{}, N = {}, DOA = {}, {:.0f}% surv".format(category, N, Ndoa, Srate))
        ax.set_xlabel("Total Incident Time (days)")
        ax.set_ylabel("Probability of Survival")

        plot += 1

    grid.suptitle("Kaplan-Meier Survival Curves", fontsize=25)
    grid.tight_layout()
    grid.subplots_adjust(top=0.9)
    grid.savefig("../doc/figures/kaplan-meier/km-grid-large.svg", transparent=True)

    # Overlay all category curves on a single combined figure.
    combined = plt.figure(figsize=(15, 10))
    ax = combined.add_subplot(1, 1, 1)
    for kmf in kmfs[: rows * columns]:
        kmf.plot(ci_show=False, show_censors=True, censor_styles={"marker": "|", "ms": 6}, ax=ax)

    ax.set_xlim(0, 15)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Total Incident Time (days)")
    ax.set_ylabel("Probability of Survival")
    ax.set_title("Kaplan-Meier Survival Curves", fontsize=25)
    ax.grid(True)
    combined.savefig("../doc/figures/kaplan-meier/km-combined-large.svg", transparent=True)

    plt.show()
Exemplo n.º 14
0
def main():
    """
    Plot the profile (size and survival rate) of the most common categories.
    """
    ## Read data

    engine, session = database.initialize('sqlite:///../data/isrid-master.db')

    query = session.query(Subject.age, Group.category, Subject.survived)
    query = query.join(Group)
    df = tabulate(query)

    database.terminate(engine, session)

    ## Process subjects by category and age

    selected_categories = df.category.value_counts()[:10].index.tolist()
    df = df[df.category.isin(selected_categories)]
    age_bins = np.linspace(0, 100, 11)

    survival_rates = np.full((10, 10), np.nan, dtype=np.float64)
    subgroup_sizes = np.full((10, 10), 0, dtype=np.float64)
    min_subgroup_size = 10

    for category, group in df.groupby('category'):
        group.insert(len(group.columns), 'age_bin',
                     np.digitize(group.age, age_bins))

        for age_index, subgroup in group.groupby('age_bin'):
            survivals = subgroup.survived.values.tolist()
            key = age_index - 1, selected_categories.index(category)

            # Only plot cells with enough subjects for a meaningful rate.
            if len(survivals) > min_subgroup_size:
                survival_rates[key] = sum(survivals)/len(survivals)
                subgroup_sizes[key] = len(survivals)

                # Debugging
                lower, upper = age_bins[age_index - 1], age_bins[age_index]
                print('{}, {} - {} years old'.format(category, int(lower),
                      int(upper)))
                print('  Survival rate: {:.3f}%'.format(
                      100*survival_rates[key]))
                print('  Number of subjects: {}'.format(
                      int(subgroup_sizes[key])))

    ## Plot survival rates and subgroup sizes

    canvas = plt.matshow(survival_rates, fignum=False, cmap='RdYlGn',
                         origin='lower')
    colorbar = plt.colorbar(canvas)
    colorbar.solids.set_edgecolor('face')
    colorbar.set_label('Survival Rate')

    x_positions = y_positions = np.arange(0, 10)
    for x in x_positions:
        for y in y_positions:
            plt.text(x, y, int(subgroup_sizes[y, x]) or '',
                     horizontalalignment='center', verticalalignment='center')

    plt.title('Lost Person Category Profiles')
    plt.ylabel('Age (years)')
    plt.xlabel('Category')

    ax = plt.gca()
    ax.xaxis.tick_bottom()
    # Bug fix: the `np.int` alias was removed in NumPy 1.24; the builtin
    # `int` is the documented replacement and behaves identically here.
    plt.yticks(np.linspace(0, 10, 11) - 0.5, age_bins.astype(int))
    plt.xticks(x_positions, selected_categories, rotation=60)
    plt.subplots_adjust(bottom=0.2)
    plt.tight_layout()
    plt.savefig('../doc/figures/subject-data/category-profiles.svg',
                transparent=True)
    plt.show()