Пример #1
0
    def test_run_advanced(self):
        """Run a chain of mkdir/touch commands and check the resulting listing."""
        scratch_dir = '/tmp/mytestdir'
        shell = Sultan()
        try:
            # Build the whole chain first; only the trailing run() is invoked.
            (
                shell.mkdir("-p /tmp/mytestdir")
                .mkdir("-p /tmp/mytestdir/foobar")
                .touch("/tmp/mytestdir/a")
                .touch("/tmp/mytestdir/b")
                .run()
            )

            listing = shell.ls("-1 /tmp/mytestdir/").run()
            self.assertEqual(listing, ['a', 'b', 'foobar'])
        finally:
            # Clean up the scratch directory whether or not the checks passed.
            if os.path.exists(scratch_dir):
                shutil.rmtree(scratch_dir)
def _sample_naming(label):
    """Map a dataset row label to ``(out_name, summary_file_name)``.

    A tuple label is read as ``(sample_id, chain)``: the output name becomes
    ``"<sample_id>_<chain>"`` and the summary file is the full-test one.
    Any other label is used unchanged and selects the jpred summary file.
    """
    if isinstance(label, tuple):
        sample_id, chain = label[0], label[1]
        return f'{sample_id}_{chain}', 'full_test_summary.joblib'
    return label, 'jpred_summary.joblib'


def _read_pssm_profile(pssm_path, structure):
    """Turn an ASCII PSSM file into the profile table that gets written out.

    Args:
        pssm_path: Path of the PSSM file to read.
        structure: Structure string already padded with one leading
            character, so that index 0 pairs with the header line and
            index ``i`` pairs with the ``i``-th data line.

    Returns:
        A DataFrame whose first column is ``Structure``, followed by
        ``Sequence`` and the ``A``..``V`` columns divided by 100.
    """
    rows = []
    with open(pssm_path, 'r') as pssm_file:
        # The first two lines are skipped; the line after them is used as
        # the header row further down.
        pssm_file.readline()
        pssm_file.readline()
        for position, raw_line in enumerate(pssm_file):
            stripped = raw_line.rstrip()
            if not stripped:
                # The first blank line ends the block that is parsed.
                break
            fields = stripped.split()
            fields.append(structure[position])
            if position == 0:
                # Pad the header with two empty cells -- presumably to line
                # it up with the two leading cells of every data row.
                fields[:0] = ['', '']
            rows.append(fields)

    # The column surgery below is order-sensitive; it is kept exactly as the
    # original sequence of positional drops and renames.
    profile = pd.DataFrame(rows)
    profile.drop(
        (profile.columns[col] for col in range(2, 22)),
        axis=1,
        inplace=True)
    profile.drop((profile.columns[-3:-1]), axis=1, inplace=True)
    profile.drop((profile.columns[0]), axis=1, inplace=True)
    # Promote the header row to column labels, then discard it from the data.
    profile.columns = profile.iloc[0]
    profile = profile[1:]
    profile.rename(columns={profile.columns[0]: "Sequence"}, inplace=True)
    profile.rename(columns={profile.columns[-1]: "Structure"}, inplace=True)
    profile = profile[
        ['Structure'] +
        [col for col in profile.columns if col != 'Structure']]
    profile.loc[:, 'A':'V'] = profile.loc[:, 'A':'V'].astype(
        float).divide(100)
    return profile


def generate_profiles(in_dataframe, out_path):
    """Generate a PSI-BLAST profile file for every sequence in a dataset.

    Intended to be used internally. For each row a temporary FASTA query is
    written, psiblast is run against the Swiss-Prot database, and the
    resulting PSSM is converted into ``<out_path>/profile/<name>.profile``
    (tab-separated). Rows for which psiblast produced no PSSM file are
    removed from the dataset, which is finally dumped into ``out_path``.

    Args:
        in_dataframe: DataFrame with ``Sequence`` and ``Structure`` columns.
            Row labels are either ``(sample_id, chain)`` tuples or plain
            sample ids. NOTE: rows without a profile are dropped from this
            object in place.
        out_path: Directory that receives the ``profile`` sub-directory and
            the joblib summary dump. It is created if missing.
    """
    out_path = Path(out_path)
    dataset = in_dataframe
    db_fasta = '../data/swiss-prot/uniprot_sprot.fasta'
    s = Sultan()

    print('Unpacking and generating Uniprot DB.')
    s.gunzip(f'-fk {db_fasta}.gz').run()
    cmd = NcbimakeblastdbCommandline(input_file=db_fasta, dbtype='prot')
    cmd()

    profile_dir = out_path / 'profile'
    profile_dir.mkdir(parents=True, exist_ok=True)

    # Fallback used only when the dataset has no rows; otherwise the value
    # chosen for the last processed row wins, as before.
    if isinstance(dataset.index, pd.MultiIndex):
        dump_path = out_path / 'full_test_summary.joblib'
    else:
        dump_path = out_path / 'jpred_summary.joblib'

    # Labels of rows without a profile. They are dropped after the loop so
    # the frame is never modified while it is being iterated.
    failed_labels = []

    with TemporaryDirectory() as psi_temp:
        for _, sample in tqdm(dataset.iterrows(),
                              total=len(dataset),
                              desc='Generating profiles'):
            out_name, summary_name = _sample_naming(sample.name)
            dump_path = out_path / summary_name

            sequence, structure = sample[['Sequence', 'Structure']]
            # One leading pad character so the header line of the PSSM gets
            # a placeholder and data line i pairs with structure[i - 1].
            structure = ' ' + structure
            pssm_path = Path(psi_temp) / f'{out_name}.pssm'

            with NamedTemporaryFile(mode='w') as blast_in:
                print(f'>{out_name}', file=blast_in)
                print(sequence, file=blast_in)
                # Make sure the query is on disk before psiblast opens it.
                blast_in.flush()
                cmd = NcbipsiblastCommandline(
                    query=blast_in.name,
                    db=db_fasta,
                    evalue=0.01,
                    num_iterations=3,
                    out_ascii_pssm=str(pssm_path),
                    num_descriptions=10000,
                    num_alignments=10000,
                    num_threads=8)
                cmd()

            if not pssm_path.exists():
                tqdm.write(
                    f'Unable to generate profile for {out_name}. No hits in the database.'
                )
                failed_labels.append(sample.name)
                continue

            profile = _read_pssm_profile(pssm_path, structure)
            profile.to_csv(profile_dir / f'{out_name}.profile',
                           sep='\t',
                           index=False)

    if failed_labels:
        dataset.drop(index=failed_labels, inplace=True)

    print(
        f'Dumping clean test to {dump_path}. Profiles are generated in {out_path}/profile'
    )
    dump(dataset, dump_path)