Exemplo n.º 1
0
def check_muta_conflict(ctx, chunksize):
    """Store the sequence character found at every mapped mutation position.

    Pages through the distinct ``(isoform, Pos)`` pairs produced by joining
    ``IDMapping`` with ``Mutation`` in the custom database, fetches the
    matching sequences through ``Identifier.sqlite_api`` and inserts one
    ``UniProtSeq`` record per pair, with the looked-up character stored
    under ``Ref``.

    Args:
        ctx: context object; ``ctx.obj['custom_db']`` is the custom database
            handle that is both read from and written to.
        chunksize: number of ``(isoform, Pos)`` pairs handled per page.
    """

    def residue_at(sequences, isoform, position):
        # ``position`` is used as a 1-based index into the sequence; a
        # position past the end of the sequence yields the placeholder 'X'.
        try:
            return sequences[isoform][position-1]
        except IndexError:
            return 'X'

    def quoted_csv(identifiers):
        # Comma-separated list of single-quoted literals for an SQL IN (...).
        return ','.join(f"'{ix}'" for ix in identifiers)

    custom_db = ctx.obj['custom_db']
    root_query = """SELECT DISTINCT isoform, Pos FROM IDMapping, Mutation
        WHERE IDMapping.ftId = Mutation.ftId AND isoform != 'NaN'"""
    fetch_iso_seq_query = "SELECT isoform, sequence FROM ALTERNATIVE_PRODUCTS WHERE isoform IN ({}) AND (sequenceStatus = 'displayed' OR sequenceStatus = 'described');"
    fetch_can_seq_query = "SELECT accession, sequence FROM INFO WHERE accession IN ({}) ;"

    count_task = custom_db.database.fetch_val(query=f"SELECT COUNT(*) FROM ({root_query});")
    total = unsync_run(count_task)
    console.log(f"Total {total} to query")

    seq_db = Identifier.sqlite_api.database
    with console.status("[bold green]checking..."):
        for page in range(ceil(total/chunksize)):
            offset = chunksize*page
            page_rows = unsync_run(custom_db.database.fetch_all(
                query=f"{root_query} LIMIT {chunksize} OFFSET {offset};"))
            unp_pos = DataFrame(page_rows, columns=['isoform', 'Pos'])

            # Identifiers containing '-' are looked up in ALTERNATIVE_PRODUCTS;
            # all others are looked up by accession in INFO.
            has_dash = unp_pos.isoform.str.contains('-')
            iso_rows = unsync_run(seq_db.fetch_all(
                query=fetch_iso_seq_query.format(quoted_csv(set(unp_pos[has_dash].isoform)))))
            can_rows = unsync_run(seq_db.fetch_all(
                query=fetch_can_seq_query.format(quoted_csv(set(unp_pos[~has_dash].isoform)))))
            sequences = dict(iso_rows + can_rows)

            unp_pos['Ref'] = [
                residue_at(sequences, isoform, position)
                for isoform, position in zip(unp_pos.isoform, unp_pos.Pos)
            ]
            custom_db.sync_insert(custom_db.UniProtSeq, unp_pos.to_dict('records'))
            console.log(f'Done: {len(unp_pos)+offset}')
Exemplo n.º 2
0
def insert_iso_range(ctx, chunksize):
    """Copy the ``iso_range`` intervals of described isoforms into ``UniProtAnnotation``.

    Reads ``(isoform, iso_range)`` rows from ``ALTERNATIVE_PRODUCTS`` (only
    rows with ``sequenceStatus = 'described'`` and ``iso_range != 'NaN'``)
    page by page and inserts one annotation record per interval into the
    custom database.

    Args:
        ctx: context object; ``ctx.obj['custom_db']`` receives the inserts.
        chunksize: number of ``ALTERNATIVE_PRODUCTS`` rows read per page.
    """
    def expand_iso_range(rows):
        # ``iso_range`` is stored as JSON text; each decoded element is
        # unpacked as a (start, end) pair and becomes one record.
        for accession, encoded_ranges in rows:
            for start, end in json.loads(encoded_ranges):
                yield {
                    'UniProt': accession,
                    'unp_start': start,
                    'unp_end': end,
                    'resource': 'iso_range',
                    'resource_id': str(start),
                }

    custom_db = ctx.obj['custom_db']
    proteins_db = Identifier.sqlite_api
    count_row = unsync_run(proteins_db.database.fetch_one(query="SELECT COUNT(*) FROM ALTERNATIVE_PRODUCTS WHERE sequenceStatus='described' AND iso_range != 'NaN'"))
    total = count_row[0]
    console.log(f"Total {total} to query")

    n_pages = ceil(total/chunksize)
    for page in range(n_pages):
        offset = chunksize*page
        page_task = proteins_db.database.fetch_all(
            query=f"""
            SELECT isoform, iso_range FROM ALTERNATIVE_PRODUCTS
                WHERE sequenceStatus = 'described' AND iso_range != 'NaN'
            LIMIT {chunksize} OFFSET {offset}
            """)
        rows = unsync_run(page_task)
        records = tuple(expand_iso_range(rows))
        custom_db.sync_insert(custom_db.UniProtAnnotation, records)
        console.log(f'Done: {len(rows)+offset}')
Exemplo n.º 3
0
def id_mapping(ctx, input, column, sep, chunksize, auto_assign, sleep):
    """Map identifiers with ``Identifiers(...).fetch('map2unp')`` and store the results.

    Identifiers come either from a delimited file (``input`` given) or from
    the ``ftId`` values of ``Mutation`` that have no row in ``IDMapping`` yet
    (``input`` is ``None``). Each batch of results is inserted into the
    ``IDMapping`` table of the custom database.

    Args:
        ctx: context object; ``ctx.obj['custom_db']`` is the custom database.
        input: path handed to ``read_csv``, or ``None`` to read pending ids
            from the database.
        column: column of ``input`` holding the ids; ``None`` reads the
            first column of a header-less file.
        sep: field separator passed to ``read_csv``.
        chunksize: number of identifiers handled per batch.
        auto_assign: value assigned to
            ``Identifier.auto_assign_when_seq_conflict``.
        sleep: when truthy, call ``tsleep(uniform(1, 10))`` after each batch.
    """
    sqlite_api = ctx.obj['custom_db']
    cols = ('ftId', 'Entry', 'isoform', 'is_canonical')
    Identifier.auto_assign_when_seq_conflict = auto_assign

    def map_and_store(batch, done_before):
        # Run the mapping for one batch, persist any results, report progress
        # and optionally pause before the next batch.
        with Progress(*progress_bar_args) as p:
            mapped = Identifiers(batch).fetch('map2unp').run(p.track).result()
        values = [dict(zip(cols, row)) for row in mapped]
        if values:
            sqlite_api.sync_insert(sqlite_api.IDMapping, values)
        console.log(f'Done: {len(mapped)+done_before}')
        if sleep:
            tsleep(uniform(1, 10))

    if input is not None:
        # File mode: de-duplicate the ids once, then walk them in slices.
        if column is None:
            frame = read_csv(input, sep=sep, header=None)
            ids = frame[0].unique()
        else:
            frame = read_csv(input, sep=sep, usecols=[column])
            ids = frame[column].unique()
        total = len(ids)
        console.log(f"Total {total} to query")
        for start in range(0, total, chunksize):
            map_and_store(ids[start:start+chunksize], start)
        return

    # Database mode: the query carries no OFFSET; every round re-selects the
    # ids that still have no IDMapping row.
    count_row = unsync_run(sqlite_api.database.fetch_one(
        query="SELECT COUNT(DISTINCT ftId) FROM Mutation WHERE ftId NOT IN (SELECT DISTINCT ftId FROM IDMapping)"))
    total = count_row[0]
    console.log(f"Total {total} to query")
    query = f"""
                SELECT DISTINCT ftId FROM Mutation
                WHERE ftId NOT IN (SELECT DISTINCT ftId FROM IDMapping)
                LIMIT {chunksize}
                """
    for index in range(ceil(total/chunksize)):
        pending = unsync_run(sqlite_api.database.fetch_all(query=query))
        if len(pending) == 0:
            break
        map_and_store((row[0] for row in pending), chunksize*index)
Exemplo n.º 4
0
 def query_from_DB_with_unps(self, table_name: str, columns: str = '*'):
     """Prepare local-database query tasks for the UniProt members of this collection.

     Members whose ``source`` is ``'UniProt'`` are first looked up in the
     ``INFO`` table. Accessions missing from ``INFO`` get one
     ``query_from_DB_with_unp(..., exists=False)`` task each; accessions that
     are present share a single bulk task against ``table_name``. The tasks
     are stored in ``self.tasks``.

     Args:
         table_name: one of ``DB_REFERENCES``, ``OTHER_DB_REFERENCES``,
             ``ALTERNATIVE_PRODUCTS``, ``FEATURES``, ``INTERACTION``, ``INFO``.
         columns: SQL column list, or ``'*'`` to use the ORM query. When
             ``table_name`` is ``'INFO'`` the list must include ``accession``
             because the fetched rows are read through ``.accession``.

     Returns:
         ``self``, with ``self.tasks`` replaced.

     Raises:
         AssertionError: if ``table_name`` is not one of the known tables.
     """
     def sql_in_list(values):
         # Render "('A','B')" explicitly. Interpolating a Python tuple breaks
         # for a single accession: its repr is "('A',)" and the trailing comma
         # is a SQL syntax error. Embedded quotes are doubled per SQL rules.
         quoted = ("'{}'".format(str(value).replace("'", "''")) for value in values)
         return '({})'.format(','.join(quoted))

     default_tables = ('DB_REFERENCES', 'OTHER_DB_REFERENCES',
                       'ALTERNATIVE_PRODUCTS', 'FEATURES', 'INTERACTION',
                       'INFO')
     assert table_name in default_tables
     obs = tuple(i for i in self if i.source == 'UniProt')
     if not obs:
         self.tasks = []
         return self
     accessions = tuple(i.identifier for i in obs)

     # Probe INFO to learn which accessions are already stored locally.
     if columns != '*' and table_name == 'INFO':
         task = Identifier.sqlite_api.database.fetch_all(
             query=
             f'SELECT {columns} FROM INFO WHERE accession IN {sql_in_list(accessions)}')
     else:
         task = Identifier.sqlite_api.INFO.objects.filter(
             accession__in=accessions).all()
     exists = unsync_run(task)

     if len(exists) == 0:
         # Nothing is stored locally: one per-identifier task for every member.
         self.tasks = [
             ob.query_from_DB_with_unp(table_name=table_name,
                                       columns=columns,
                                       exists=False) for ob in obs
         ]
         return self

     exist_ids = frozenset(i.accession for i in exists)
     rest_ids = frozenset(accessions) - exist_ids
     rest_dfs = [
         self[accession].query_from_DB_with_unp(table_name=table_name,
                                                columns=columns,
                                                exists=False)
         for accession in rest_ids
     ]
     if table_name == 'INFO':
         # The probe above already returned the requested INFO rows.
         ap = unsync_wrap(exists)
     elif columns == '*':
         ap = unsync_wrap(
             getattr(Identifier.sqlite_api,
                     table_name).objects.filter(
                         accession__in=exist_ids).all())
     else:
         ap = unsync_wrap(
             Identifier.sqlite_api.database.fetch_all(
                 query=
                 f'SELECT {columns} FROM {table_name} WHERE accession IN {sql_in_list(exist_ids)}'
             ))
     rest_dfs.append(ap)
     self.tasks = rest_dfs
     return self
Exemplo n.º 5
0
def sifts_mapping(ctx, input, column, sep, func, kwargs, chunksize, entry_filter, chain_filter, skip_pdbs, omit, output, iteroutput, sleep):
    """Run ``SIFTSs(...).fetch(func, **kwargs)`` in batches and append the results to a TSV file.

    Identifiers come either from a delimited file (``input`` given) or from
    the distinct ``Entry``/``isoform`` rows of ``IDMapping`` in the custom
    database (``input`` is ``None``).

    Args:
        ctx: context object; ``ctx.obj['custom_db']`` is the custom database
            and ``ctx.obj['folder']`` the output directory.
        input: path handed to ``read_csv``, or ``None`` to read from the
            database.
        column: column of ``input`` holding the ids; ``None`` reads the
            first column of a header-less file.
        sep: field separator passed to ``read_csv``.
        func: name passed to ``fetch``; also the default output file stem.
        kwargs: iterable of ``"key=value"`` strings, several of which may be
            joined with ``;``. Every value is evaluated with ``eval``.
        chunksize: number of identifiers handled per batch.
        entry_filter: assigned to ``SIFTS.entry_filter``.
        chain_filter: assigned to ``SIFTS.chain_filter``.
        skip_pdbs: iterable of comma-separated strings; when non-empty the
            flattened list is passed on as ``kwargs['skip_pdbs']``.
        omit: number of leading records to skip (SQL ``OFFSET`` in database
            mode, ``skiprows`` in file mode).
        output: output file name; ``''`` selects ``f'{func}.tsv'``.
        iteroutput: file mode only; when truthy each ``DataFrame`` result is
            appended on its own, otherwise the whole batch is written as one
            header-less frame.
        sleep: when truthy, call ``tsleep(uniform(1, 10))`` after every
            full-size batch.
    """
    def get_unp_id(args):
        Entry, isoform, is_canonical = args
        return Entry if is_canonical else isoform

    # Split on the first '=' only, so a value may itself contain '='
    # (e.g. "cutoff=x==1"), and skip empty fragments left by a trailing ';'.
    # Both cases previously raised ValueError inside dict().
    kwargs = dict(
        sub.split('=', 1)
        for item in kwargs
        for sub in item.split(';')
        if sub
    )
    if len(kwargs) > 0:
        # NOTE(security): eval() executes arbitrary expressions taken from the
        # command line. Only pass trusted input; consider ast.literal_eval if
        # plain literals are all that is ever needed.
        for key, value in kwargs.items():
            kwargs[key] = eval(value)
        console.log(f"take args: {kwargs}")

    skip_pdbs = [pdbi for item in skip_pdbs for pdbi in item.split(',')]
    if skip_pdbs:
        kwargs['skip_pdbs'] = skip_pdbs

    SIFTS.entry_filter = entry_filter
    SIFTS.chain_filter = chain_filter
    sqlite_api = ctx.obj['custom_db']
    output = f'{func}.tsv' if output == '' else output
    output_path = ctx.obj['folder']/output

    if input is None:
        total = unsync_run(sqlite_api.database.fetch_one(
            query="SELECT COUNT(DISTINCT isoform) FROM IDMapping WHERE isoform != 'NaN'"))[0] - omit
        console.log(f"Total {total} to query")
        for i in range(ceil(total/chunksize)):
            res = unsync_run(sqlite_api.database.fetch_all(
                query=f"""
                SELECT DISTINCT Entry,isoform,is_canonical FROM IDMapping
                WHERE isoform != 'NaN'
                LIMIT {chunksize} OFFSET {omit+chunksize*i}
                """))
            with Progress(*progress_bar_args) as p:
                res = SIFTSs(map(get_unp_id, res)).fetch(func, **kwargs).run(p.track).result()
            for dfrm in res:
                if dfrm is None:
                    continue
                # Sorted columns keep appended chunks aligned; the header is
                # written only when the file does not exist yet.
                dfrm[sorted(dfrm.columns)].to_csv(output_path, sep='\t', index=False,
                            header=not output_path.exists(), mode='a+')
            console.log(f'Done: {len(res)+chunksize*i}')
            if sleep and len(res) == chunksize:
                tsleep(uniform(1, 10))
    else:
        skiprows = omit if omit > 0 else None
        if column is None:
            ids = read_csv(input, sep=sep, header=None, skiprows=skiprows)[0].unique()
        else:
            ids = read_csv(input, sep=sep, usecols=[column], skiprows=skiprows)[column].unique()
        total = len(ids)
        console.log(f"Total {total} to query")
        for i in range(0, total, chunksize):
            with Progress(*progress_bar_args) as p:
                res = SIFTSs(ids[i:i+chunksize]).fetch(func, **kwargs).run(p.track).result()
            if iteroutput:
                for dfrm in res:
                    # None and any non-DataFrame result are skipped.
                    if isinstance(dfrm, DataFrame):
                        dfrm[sorted(dfrm.columns)].to_csv(output_path, sep='\t', index=False, header=not output_path.exists(), mode='a+')
            else:
                DataFrame(res).to_csv(output_path, sep='\t', index=False, header=False, mode='a+')
            console.log(f'Done: {i+len(res)}')
            if sleep and len(res) == chunksize:
                tsleep(uniform(1, 10))
Exemplo n.º 6
0
>>> writer(
        reader(f'http://www.ebi.ac.uk/pdbe/static/entry/download/{header}.cif.gz'), 
        f'{header}-pdbe_chain_remapping.cif',
        b'data_%s\n#\nloop_\n' % bytes(header, 'utf-8'),
        b'_pdbe_chain_remapping'
        ).result()

>>> parser(
        reader(f'http://www.ebi.ac.uk/pdbe/static/entry/download/{header}.cif.gz'),
        ('data_%s\n' % header, '#\n', 'loop_\n'),
        b'_pdbe_chain_remapping'
        )
>>> 
'''

# Module-level semaphore, created once by running init_semaphore(10) through
# unsync_run. NOTE(review): presumably 10 is the maximum number of concurrent
# operations allowed — confirm against init_semaphore.
semaphore = unsync_run(init_semaphore(10))


def iter_index(text, target, add):
    '''
    >>> text = b'sdgfsd\nsdgsdg\nfdsg\nd'
    >>> index = (None, *iter_index(text), None)
    >>> print(index)
    >>> tuple(text[start:end] for start,end in zip(index, index[1:]))
    '''
    text_len = len(text)
    start = -1
    while True:
        try:
            res = text.index(target, start + 1) + add
            yield res