Exemplo n.º 1
0
    def filterByCutoff(self, cutoff=None):
        """ Keep only results that pass the one-sided Chebyshev cutoff.

        cutoff: value convertible to float, expected in [0.0, 1.0).
            None or 0.0 switches outlier filtering off. A value that is
            out of range, or that cannot be converted to float, is logged
            as an error and replaced by the default of 0.05.

        When filtering is on, self.data_collection is replaced by the
        result of its filteredChebyshevUpper(p=cutoff) call. In all cases
        rr.dieOnCritical is called if the collection is None or the
        maximum of its ranks is 0.
        """
        default_cutoff = 0.05
        rr = RunRecord('filterByCutoff')

        rr.addInfo('Starting no. of genes', self.data_collection.N)

        # exclude outlier genes using one-sided Chebyshev
        if cutoff is not None and cutoff != 0.0:
            try:
                cutoff = float(cutoff)
            except (TypeError, ValueError):
                # float() raises TypeError (not ValueError) for input that
                # is neither a string nor a number, e.g. a list
                rr.addError('Cutoff not given as float', cutoff)
                rr.addInfo('Cutoff set to default', default_cutoff)
                cutoff = default_cutoff
            else:
                if cutoff < 0.0 or cutoff >= 1.0:
                    rr.addError('Cutoff out of range', cutoff)
                    rr.addInfo('Cutoff set to default', default_cutoff)
                    cutoff = default_cutoff

            # Do Chebyshev filtering
            self.data_collection =\
                    self.data_collection.filteredChebyshevUpper(p=cutoff)
            rr.addInfo('Used Chebyshev filter cutoff', cutoff)
            # The check below allows for the collection being None after
            # filtering, so do not dereference it here in that case.
            if self.data_collection is not None:
                rr.addInfo('No. genes after normalisation filter',
                           self.data_collection.N)
        else:
            rr.addInfo('Outlier cutoff filtering', 'Off')

        if self.data_collection is None or\
                self.data_collection.ranks.max() == 0:
            rr.dieOnCritical('No data after filtering', 'Failure')
Exemplo n.º 2
0
def get_chroms(session):
    """ Return the list of chroms stored as a ',' separated string.

    session: an open DB session, a str that is passed to make_session
        to open one, or None.

    Returns ['No connection to DB'] when session is None, an empty list
    (after logging an error) when the Chroms query finds no row, and
    otherwise the chromStr field of the single Chroms row split on ','.
    """
    if session is None:
        return ['No connection to DB']
    # isinstance also accepts str subclasses, unlike an exact type() match
    if isinstance(session, str):
        session = make_session(session)

    rr = RunRecord('get_chroms')
    # keep the try body down to the one call expected to raise NoResultFound
    try:
        chroms_row = session.query(Chroms).one()
    except NoResultFound:
        rr.addError('Chroms found', None)
        return []
    return chroms_row.chromStr.split(',')
Exemplo n.º 3
0
def add_data(session,
             name,
             description,
             path,
             expr_table,
             gene_id_heading='gene',
             probeset_heading='probeset',
             expr_heading='exp',
             sample_type=sample_types['abs_expr'],
             reffile1=None,
             reffile2=None):
    """ A unified interface for adding data to the DB.

    session: DB session handed to every add_* helper.
    name, description: passed to add_sample to create the sample.
    path, expr_table: passed through to the loader for the sample type.
    gene_id_heading: used as ensembl_id_label when loading target genes.
    probeset_heading, expr_heading: accepted but not used in this function.
    sample_type: a key of sample_types selecting which loader runs.
    reffile1, reffile2: both required for the 'diff_expr' type.

    Returns False without loading anything if the sample already has
    data. Returns the result of add_expression_study for 'abs_expr', and
    True once the loader has been called for 'diff_expr' or
    'target_genes'. An unknown sample type is reported through
    rr.dieOnCritical.

    Raises AssertionError if 'diff_expr' is requested without both
    reference files.
    """
    rr = RunRecord('add_data')

    success = add_sample(session, name, description)
    if not success:
        # Check if any sample exists without data
        existing_data, existing_type = check_existing_data(session, name)
        if existing_data > 0:
            rr.addError(name + ' already has data loaded', existing_data)
            rr.addError('data of type', existing_type)
            return False
        else:
            rr.addInfo('now loading data for existing sample', name)

    # either sample was created or existed with no data, so load data now
    if sample_types[sample_type] == sample_types['abs_expr']:
        success = add_expression_study(session, name, path, expr_table)

    elif sample_types[sample_type] == sample_types['diff_expr']:
        # diff between two files, check we got the related files
        assert reffile1 is not None and reffile2 is not None,\
        'To enter differences in gene expression you must specify the 2 '\
        'files that contain the absolute measures.'
        add_expression_diff_study(session, name, path, expr_table, reffile1,
                                  reffile2)
        # Without this, loading into an existing empty sample would report
        # the stale False from add_sample even though data was loaded.
        # NOTE(review): the loader's own return value is not inspected
        # here - confirm whether it signals failure.
        success = True

    elif sample_types[sample_type] == sample_types['target_genes']:
        add_target_genes(session,
                         name,
                         path,
                         expr_table,
                         ensembl_id_label=gene_id_heading)
        # Same reasoning as for 'diff_expr' above.
        success = True
    else:
        rr.dieOnCritical('Unknown sample type', sample_type)

    return success
Exemplo n.º 4
0
def main():
    """ Log a summary of a ChipPy DB, for all samples or for one sample.

    With no sample given, the totals of samples, genes, exons,
    expression, differential, target gene and reference file entries are
    reported together with the sample names. With a sample given, only
    the expression, differential and target gene totals and the
    reference file entries for that sample are reported.
    """
    rr = RunRecord('db_summary')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='DB Summary')
    session = make_session(args.db_path)
    sample_name = args.sample or None
    whole_db = sample_name is None

    chroms = get_chroms(session)
    species = get_species(session)

    # gather everything first, in the same order the DB was queried before
    if whole_db:
        total_samples_count = get_sample_counts(session)
        sample_names = get_all_sample_names(session)
        total_genes_count = get_gene_counts(session)
        total_exon_count = get_exon_counts(session)
        total_expr_count = get_expression_counts(session)
        total_diff_genes_count = get_diff_counts(session)
        total_target_genes_count = get_targetgene_counts(session)
        total_reffiles_count = get_reffile_counts(session)
    else:
        total_expr_count = get_expression_counts(session, sample_name)
        total_diff_genes_count = get_diff_counts(session, sample_name)
        total_target_genes_count = get_targetgene_counts(session, sample_name)
        reffiles_entries = get_reffile_entries(session,
                                               sample_name=sample_name)

    # build the report as (label, value) rows, then log them in order
    report = [
        ('ChipPy DB name', args.db_path),
        ('Species name', species),
        ('Chroms list', chroms),
    ]
    if whole_db:
        report.extend([
            ('Total # of sample entries', total_samples_count),
            ('Sample names', sample_names),
            ('Total # of gene entries', total_genes_count),
            ('Total # of exon entries', total_exon_count),
        ])
    report.extend([
        ('Total # of absolute-scored gene entries', total_expr_count),
        ('Total # of differential gene entries', total_diff_genes_count),
        ('Total # of target gene entries', total_target_genes_count),
    ])

    reffile_missing = False
    if whole_db:
        report.append(('Total # of reference files', total_reffiles_count))
    elif len(reffiles_entries) > 0:
        report.append(('Reference file name', reffiles_entries))
    else:
        reffile_missing = True

    for label, value in report:
        rr.addInfo(label, value)
    if reffile_missing:
        rr.addError('Reference file name', 'Not Available')

    rr.display()
Exemplo n.º 5
0
def _get_targetgene_query(session, sample_name=None, biotype='protein_coding'):
    """ Build the query for TargetGene records joined to Gene.

    sample_name: when given and matched by _get_sample, the query is
        limited to that sample. When given but unmatched, an error is
        logged and all samples are used. When None, all samples are used.
    biotype: when truthy, the query is limited to genes of this biotype.

    Returns the query object; nothing is executed here.
    """
    rr = RunRecord('get_targets')

    sample = None
    if sample_name is not None:
        sample = _get_sample(session, sample_name)
        if sample is None:
            rr.addError('Using all samples, as no sample matches name',
                        sample_name)

    # every case starts from the same join; only the filters differ
    query = session.query(TargetGene).join(Gene)
    if sample is not None:
        query = query.filter(TargetGene.sample_id == sample.sample_id)
    if biotype:
        query = query.filter(Gene.biotype == biotype)
    return query
Exemplo n.º 6
0
def main():
    """ Create a new ChipPy DB file and populate it with Ensembl gene data.

    The DB file name is built from the save_db_prefix, ensembl_release
    and species arguments and placed in save_db_dir. If a file of that
    name already exists an error is logged and nothing is written.
    Otherwise the DB is created, gene data and dummy expression data are
    added, and the real path of the DB is printed. The run log is
    displayed when show_log is set.
    """
    rr = RunRecord('start_chippy_db')
    rr.addCommands(sys.argv)

    args = script_info['args'].parse()
    save_dir = args.save_db_dir
    create_path(save_dir)

    if not os.path.isdir(save_dir):
        sys.stderr.write('The save_db_dir must be an existing directory.\n')
        return

    release = args.ensembl_release
    species = args.species
    name_parts = [args.save_db_prefix, '_chippy_', str(release), '_',
                  species, '.db']
    db_path = os.path.join(save_dir, ''.join(name_parts))

    if os.path.exists(db_path):
        rr.addError('Chippy DB with this name already exists', db_path)
    else:
        session = make_session(db_path)

        account = HostAccount(args.hostname, args.username, args.password,
                              port=args.port)
        add_ensembl_gene_data(session,
                              species,
                              ensembl_release=release,
                              account=account)

        if create_dummy_expr(session):
            rr.addInfo('Dummy data added successfully', 'Expr=1.')
        else:
            rr.addError('Dummy data failed to upload to DB',
                        'Expect bigger problems')

        rr.addInfo('Chippy DB written', db_path)
        print(os.path.realpath(db_path))

    if args.show_log:
        rr.display()
Exemplo n.º 7
0
 def normaliseByRPM(self):
     """ Scale the collection's counts to counts per million mapped tags.

         Reads the 'mapped tags' entry of the collection's info['args']
         and multiplies every item of the collection's counts by
         1000000.0 divided by that value, storing the result as a numpy
         array. If the entry is missing, an error is logged and the
         counts are left unchanged.
     """
     rr = RunRecord('normaliseByRPM')
     collection = self.data_collection
     try:
         mapped_tags = collection.info['args']['mapped tags']
         rr.addInfo("'mapped tags' value", mapped_tags)
     except KeyError:
         rr.addError('Info field not found', 'mapped tags')
         return

     scale = 1000000.0 / mapped_tags
     rr.addInfo('normalising by RPMs', scale)
     collection.counts = numpy.array(
         [count * scale for count in collection.counts])
Exemplo n.º 8
0
def drop_sample_records(session, sample_name, test=False):
    """
        Delete a sample together with the reference file, expression,
        expression diff and target gene records that carry its sample_id.

        sample_name: name used to look the sample up via _get_sample.
        test: when true, the deletes are issued but not committed.

        Returns True only when the deletes were committed. When no sample
        matches, when test is set, or when the commit raises
        NoResultFound, the session is rolled back and False is returned.
    """
    rr = RunRecord('drop_sample_records')
    sample = _get_sample(session, sample_name)
    if sample is None:
        rr.addError('No sample found by name', sample_name)
        session.rollback()
        return False

    rr.addInfo('Deleting records for ', sample_name)

    # records that point at the sample go first, in this fixed order
    dependants = (
        (ReferenceFile, 'Number of reference file records deleted'),
        (Expression, 'Number of expression records deleted'),
        (ExpressionDiff, 'Number of expressionDiff records deleted'),
        (TargetGene, 'Number of target gene records deleted'),
    )
    for table, label in dependants:
        deleted = session.query(table).filter(
            table.sample_id == sample.sample_id).delete()
        rr.addInfo(label, deleted)

    # finally the sample row itself; exactly one row is expected
    deleted = session.query(Sample).filter(
        Sample.sample_id == sample.sample_id).delete()
    assert deleted == 1, 'Incorrect number of samples removed' + str(
        deleted)
    rr.addInfo('Number of sample records deleted', deleted)

    try:
        if test:
            rr.addInfo('session deletes not committed', sample_name)
        else:
            session.commit()
            return True
    except NoResultFound:
        rr.addError('Deletes could not take place', 'Commit failed')

    session.rollback()
    return False