Exemplo n.º 1
0
def load(expression_tsv, experiment_md=None):
    """Read an expression matrix TSV and return it as a validated ExpressionMatrix.

    The file is opened with gopen and parsed with pandas as a tab-separated
    table whose first row is the header.  The "ID" column is read as strings
    and is installed as the index (while also being kept as a column), with
    index integrity verified.  If experiment_md is provided,
    validate_replicates is also run against it.

    Raises LrgaspException, chained to the original error, when an
    LrgaspException, pandas ParserError / EmptyDataError or ValueError
    occurs during the load.
    """
    try:
        with gopen(expression_tsv) as tsv_fh:
            # Explicit na_values, with pandas' default NA strings disabled,
            # are needed so empty cells and short rows can be detected
            # rather than being implicitly converted to NaN.
            frame = pd.read_csv(
                tsv_fh,
                dialect=csv.excel_tab,
                header=0,
                converters={"ID": str},
                na_values=na_values,
                keep_default_na=False,
            )
            matrix = ExpressionMatrix(frame)
            validate_header(matrix)
            matrix.df.set_index(
                "ID", drop=False, inplace=True, verify_integrity=True)
            validate_data(matrix)
            if experiment_md is not None:
                validate_replicates(experiment_md, matrix)
    except (LrgaspException, pd.errors.ParserError, pd.errors.EmptyDataError,
            ValueError) as ex:
        raise LrgaspException(
            f"Parse of expression matrix TSV failed: {expression_tsv}") from ex
    else:
        return matrix
def experiment_load(experiment_json):
    """Load and validate experiment metadata from a JSON file.

    The file is opened with gopen and parsed with json.load, using ObjDict as
    the object_pairs_hook.  The parsed object is then passed to
    experiment_validate and returned.

    Raises LrgaspException, chained to the original error, if the file is not
    valid JSON, cannot be decoded as text, or fails validation.
    """
    try:
        with gopen(experiment_json) as fh:
            experiment = json.load(fh, object_pairs_hook=ObjDict)
    except (json.decoder.JSONDecodeError, UnicodeDecodeError) as ex:
        # UnicodeDecodeError is caught as well, matching the entry metadata
        # loaders, so a badly encoded file is reported as a parse failure
        # instead of escaping as a raw decoding error.
        raise LrgaspException(f"parse of experiment metadata (JSON) failed: {experiment_json}") from ex
    try:
        experiment_validate(experiment)
    except LrgaspException as ex:
        raise LrgaspException(f"validation of experiment metadata failed: {experiment_json}") from ex
    return experiment
def load(model_map_tsv):
    """Read a reads-to-models TSV into a ReadModelMap and return it.

    Every item produced by _tsv_reader is added to the map.  A file that
    yields no entries is treated as an error.

    Raises LrgaspException, chained to the original error, on an
    LrgaspException, FileNotFoundError or csv.Error.
    """
    mapping = ReadModelMap()
    try:
        with gopen(model_map_tsv) as tsv_fh:
            for entry in _tsv_reader(tsv_fh):
                mapping.add(entry)
        if len(mapping) > 0:
            return mapping
        # Nothing was added: handled by the except clause below like any
        # other parse failure.
        raise LrgaspException("TSV contains no data")
    except (LrgaspException, FileNotFoundError, csv.Error) as ex:
        message = "Parse of reads-to-models TSV failed: {}".format(model_map_tsv)
        raise LrgaspException(message) from ex
Exemplo n.º 4
0
def load(entry_json):
    """Load entry metadata from a JSON file, validate it, and return it.

    Parsing uses json.load with ObjDict as the object_pairs_hook; the result
    is checked with entry_validate.  The returned object has its
    ``experiments`` attribute set to None.

    Raises LrgaspException, chained to the original error, if the file
    cannot be parsed/decoded or if validation fails.
    """
    # Step 1: parse the JSON document.
    try:
        with gopen(entry_json) as json_fh:
            metadata = json.load(json_fh, object_pairs_hook=ObjDict)
    except (json.decoder.JSONDecodeError, UnicodeDecodeError) as parse_err:
        raise LrgaspException(
            f"parse of entry metadata (JSON) failed: {entry_json}"
        ) from parse_err

    # Step 2: validate the parsed content.
    try:
        entry_validate(metadata)
    except LrgaspException as invalid_err:
        raise LrgaspException(
            f"validation of entry metadata failed: {entry_json}"
        ) from invalid_err

    # Step 3: initialise the experiments attribute only after validation.
    metadata.experiments = None
    return metadata
Exemplo n.º 5
0
def expr_mat_load(expr_mat_tsv):
    """Load an expression matrix TSV into a pandas DataFrame and validate it.

    The file is first checked with check_row_consistency, then parsed as a
    tab-separated table with a header row and a string "ID" column, and
    finally passed to validate.

    Raises LrgaspException, chained to the original error, on an
    LrgaspException or a pandas ParserError / EmptyDataError.
    """
    try:
        # Row widths are checked in a separate pass before pandas reads the file.
        check_row_consistency(expr_mat_tsv)
        with gopen(expr_mat_tsv) as tsv_fh:
            frame = pd.read_csv(
                tsv_fh,
                dialect=csv.excel_tab,
                header=0,
                converters={"ID": str},
            )
        validate(frame)
    except (LrgaspException, pd.errors.ParserError,
            pd.errors.EmptyDataError) as ex:
        message = "Parse of expression matrix TSV failed: {}".format(
            expr_mat_tsv)
        raise LrgaspException(message) from ex
    else:
        return frame
def check_row_consistency(expression_tsv):
    """Verify that every row of the TSV has the same number of columns.

    Pandas just pads or ignores an inconsistent number of columns, so a
    separate pass with the csv module is made to confirm the matrix is
    rectangular.

    Raises LrgaspException if the file has no rows at all, or if the rows
    do not all have the same number of columns.
    """
    with gopen(expression_tsv) as tsv_fh:
        # Only the distinct row widths matter, so collect them in a set.
        widths = {
            len(fields)
            for fields in csv.reader(tsv_fh, dialect=csv.excel_tab)
        }

    if not widths:
        raise LrgaspException("TSV is empty")

    narrowest, widest = min(widths), max(widths)
    if narrowest != widest:
        raise LrgaspException(
            f"TSV has an inconsistent number of columns: min={narrowest}, max={widest}"
        )
def load(expression_tsv):
    """Load an expression matrix TSV into a validated pandas DataFrame.

    The file is checked with check_row_consistency, parsed as a
    tab-separated table with a header row and a string "ID" column, and
    validated with validate_header and validate_data.  "ID" becomes the
    index (and is kept as a column), with index integrity verified.

    Raises LrgaspException, chained to the original error, on an
    LrgaspException, pandas ParserError / EmptyDataError or ValueError.
    """
    try:
        check_row_consistency(expression_tsv)
        with gopen(expression_tsv) as tsv_fh:
            frame = pd.read_csv(
                tsv_fh,
                dialect=csv.excel_tab,
                header=0,
                converters={"ID": str},
            )
            # The header is validated before the frame is indexed by "ID".
            validate_header(frame)
            frame.set_index(
                "ID", drop=False, inplace=True, verify_integrity=True)
            validate_data(frame)
    except (LrgaspException, pd.errors.ParserError, pd.errors.EmptyDataError,
            ValueError) as ex:
        message = "Parse of expression matrix TSV failed: {}".format(
            expression_tsv)
        raise LrgaspException(message) from ex
    else:
        return frame
Exemplo n.º 8
0
def load(entry_json):
    """Load entry metadata from a JSON file, validate it, and return it.

    Parsing uses json.load with ObjDict as the object_pairs_hook.  The
    parsed fields are checked with check_from_defs against entry_fields,
    the non-serialized attributes (``experiments``, ``entry_dir``,
    ``entry_json``) are then attached, and entry_validate is run.

    Raises LrgaspException, chained to the original error, if the file
    cannot be parsed/decoded or if the field check or validation fails.
    """
    try:
        with gopen(entry_json) as json_fh:
            metadata = json.load(json_fh, object_pairs_hook=ObjDict)
    except (json.decoder.JSONDecodeError, UnicodeDecodeError) as parse_err:
        raise LrgaspException(
            f"parse of entry metadata (JSON) failed: {entry_json}"
        ) from parse_err

    try:
        check_from_defs("entry", entry_fields, metadata)
        # Non-serialized attributes are attached only after the field check
        # has run on the parsed content.
        metadata.experiments = None
        metadata.entry_dir = None
        metadata.entry_json = entry_json
        entry_validate(metadata)
    except LrgaspException as invalid_err:
        raise LrgaspException(
            f"validation of entry metadata failed: {entry_json}"
        ) from invalid_err

    return metadata