Exemplo n.º 1
0
 def test_add(self):
     '''Check the bins mapping after each call to add() with a bin size of 3'''
     hist = histogram.Histogram(3)
     # Each step adds one value, then compares the complete bins mapping.
     steps = [
         (4, {3: 1}),
         (4, {3: 2}),
         (42, {3: 2, 42: 1}),
     ]
     for value, expected_bins in steps:
         hist.add(value)
         self.assertEqual(expected_bins, hist.bins)
Exemplo n.º 2
0
    def _load_minimap_insert_histogram(cls, infile, bin_size):
        '''Build a Histogram with the given bin size from a tab-separated file.

        Every line of infile must hold exactly two tab-separated integer
        fields: a value, and the count to add for that value.
        Returns the populated Histogram.'''
        insert_hist = histogram.Histogram(bin_size)

        with open(infile) as handle:
            for record in handle:
                fields = record.rstrip().split('\t')
                value, count = fields
                insert_hist.add(int(value), count=int(count))

        return insert_hist
Exemplo n.º 3
0
 def test_len(self):
     '''Check that len() starts at zero and grows by one per added value'''
     hist = histogram.Histogram(1)
     self.assertEqual(0, len(hist))
     # The value 1 is added twice; the length is expected to grow both times.
     for expected_len, value in enumerate([1, 1, 2], start=1):
         hist.add(value)
         self.assertEqual(expected_len, len(hist))
Exemplo n.º 4
0
 def test_stats(self):
     '''Check the tuple returned by stats() for a hand-built histogram'''
     hist = histogram.Histogram(1)
     for value in range(1, 11):
         hist.add(value)
     # Overwrite selected bin counts directly to shape the distribution.
     overrides = ((3, 3), (4, 3), (5, 5), (6, 3), (7, 2), (8, 2))
     for bin_key, count in overrides:
         hist.bins[bin_key] = count
     expected = (2.5, 5.5, 10.5, 0.91)
     self.assertEqual(expected, hist.stats())
Exemplo n.º 5
0
    def test_to_bin(self):
        '''Check the bin that _to_bin returns for values 0-6 with a bin size of 3'''
        hist = histogram.Histogram(3)
        # (value, expected bin) pairs
        cases = (
            (0, 0),
            (1, 0),
            (2, 0),
            (3, 3),
            (4, 3),
            (5, 3),
            (6, 6),
        )

        for value, expected_bin in cases:
            self.assertEqual(hist._to_bin(value), expected_bin)
Exemplo n.º 6
0
 def test_load_minimap_insert_histogram(self):
     '''Compare _load_minimap_insert_histogram output with a hand-built histogram'''
     bin_size = 10
     infile = os.path.join(
         data_dir, 'clusters_test_load_minimap_insert_histogram.in')
     got = clusters.Clusters._load_minimap_insert_histogram(
         infile, bin_size)

     # (value, count) pairs used to build the expected histogram.
     value_counts = [
         (85, 1),
         (86, 2),
         (90, 4),
         (91, 6),
         (97, 10),
         (100, 7),
         (111, 3),
     ]
     expected = histogram.Histogram(bin_size)
     for value, count in value_counts:
         expected.add(value, count=count)
     self.assertEqual(expected, got)
Exemplo n.º 7
0
    def __init__(
        self,
        refdata_dir,
        reads_1,
        reads_2,
        outdir,
        extern_progs,
        version_report_lines=None,
        assembly_kmer=21,
        assembly_coverage=100,
        threads=1,
        verbose=False,
        assembler='fermilite',
        spades_mode='rna',
        spades_options=None,
        max_insert=1000,
        min_scaff_depth=10,
        nucmer_min_id=90,
        nucmer_min_len=20,
        nucmer_breaklen=200,
        assembled_threshold=0.95,
        unique_threshold=0.03,
        max_gene_nt_extend=30,
        clean=True,
        tmp_dir=None,
    ):
        '''Record the run parameters, derive file paths and create the output directories.

        Parameters handled specially:
          refdata_dir: directory passed to _load_reference_data_from_dir; the
              paths of '02.cdhit.clusters.tsv', '02.cdhit.all.fa' and other
              reference files are derived from it (they are not opened here).
          reads_1, reads_2, outdir: stored as absolute paths.
          version_report_lines: stored as given, or as a new empty list
              when None.
          clean: when true, a tempfile.TemporaryDirectory is created inside
              tmp_dir and used as the working directory; otherwise the
              directory 'clusters' is created inside outdir and used instead.
          tmp_dir: when None, taken from the environment variable
              ARIBA_TMPDIR, then TMPDIR, and finally defaults to outdir.
              It must exist even when clean is false.
        All remaining parameters are stored unchanged as attributes of the
        same name.

        Side effects: creates outdir, outdir/Logs and outdir/.fails with
        os.mkdir (so none of them may already exist), and tries to install
        self._receive_signal as the handler for every name in the signal
        module that starts with "SIG", except SIGCHLD and SIGCLD.

        Raises:
          Error: if a directory cannot be created or tmp_dir does not exist.
        '''
        self.refdata_dir = os.path.abspath(refdata_dir)
        self.refdata, self.cluster_ids = self._load_reference_data_from_dir(
            refdata_dir)
        self.reads_1 = os.path.abspath(reads_1)
        self.reads_2 = os.path.abspath(reads_2)
        self.outdir = os.path.abspath(outdir)
        self.extern_progs = extern_progs
        self.clusters_tsv = os.path.abspath(
            os.path.join(refdata_dir, '02.cdhit.clusters.tsv'))
        self.all_ref_seqs_fasta = os.path.abspath(
            os.path.join(refdata_dir, '02.cdhit.all.fa'))

        # Avoid sharing one list between instances when no lines are given.
        if version_report_lines is None:
            self.version_report_lines = []
        else:
            self.version_report_lines = version_report_lines

        self.clean = clean
        self.logs_dir = os.path.join(self.outdir, 'Logs')

        self.assembler = assembler
        self.assembly_kmer = assembly_kmer
        self.assembly_coverage = assembly_coverage
        self.spades_mode = spades_mode
        self.spades_options = spades_options

        self.cdhit_files_prefix = os.path.join(self.refdata_dir, 'cdhit')
        self.cdhit_cluster_representatives_fa = self.cdhit_files_prefix + '.cluster_representatives.fa'
        self.bam_prefix = os.path.join(self.outdir,
                                       'map_reads_to_cluster_reps')
        self.bam = self.bam_prefix + '.bam'
        self.report_file_all_tsv = os.path.join(self.outdir,
                                                'debug.report.tsv')
        self.report_file_filtered = os.path.join(self.outdir, 'report.tsv')
        self.mlst_reports_prefix = os.path.join(self.outdir, 'mlst_report')
        self.mlst_profile_file = os.path.join(self.refdata_dir,
                                              'pubmlst.profile.txt')
        self.catted_assembled_seqs_fasta = os.path.join(
            self.outdir, 'assembled_seqs.fa.gz')
        self.catted_genes_matching_refs_fasta = os.path.join(
            self.outdir, 'assembled_genes.fa.gz')
        self.catted_assemblies_fasta = os.path.join(self.outdir,
                                                    'assemblies.fa.gz')
        self.threads = threads
        self.verbose = verbose

        self.max_insert = max_insert

        # Insert size statistics start unset; only the empty histogram is
        # created here.
        self.insert_hist_bin = 10
        self.insert_hist = histogram.Histogram(self.insert_hist_bin)
        self.insert_size = None
        self.insert_sspace_sd = None
        self.insert_proper_pair_max = None

        self.min_scaff_depth = min_scaff_depth
        self.nucmer_min_id = nucmer_min_id
        self.nucmer_min_len = nucmer_min_len
        self.nucmer_breaklen = nucmer_breaklen

        self.assembled_threshold = assembled_threshold
        self.unique_threshold = unique_threshold
        self.max_gene_nt_extend = max_gene_nt_extend

        self.cluster_to_dir = {}  # gene name -> abs path of cluster directory
        self.clusters = {}  # gene name -> Cluster object
        self.cluster_read_counts = {}  # gene name -> number of reads
        self.cluster_base_counts = {}  # gene name -> number of bases
        self.pool = None
        self.fails_dir = os.path.join(self.outdir, '.fails')
        self.clusters_all_ran_ok = True

        for d in [self.outdir, self.logs_dir, self.fails_dir]:
            try:
                os.mkdir(d)
            except OSError as err:
                # Only filesystem failures are translated; KeyboardInterrupt
                # and SystemExit are left to propagate untouched.
                raise Error('Error mkdir ' + d) from err

        if tmp_dir is None:
            if 'ARIBA_TMPDIR' in os.environ:
                tmp_dir = os.path.abspath(os.environ['ARIBA_TMPDIR'])
            elif 'TMPDIR' in os.environ:
                tmp_dir = os.path.abspath(os.environ['TMPDIR'])
            else:
                tmp_dir = self.outdir

        if not os.path.exists(tmp_dir):
            raise Error('Temporary directory ' + tmp_dir +
                        ' not found. Cannot continue')

        if self.clean:
            # The TemporaryDirectory object is kept on the instance so the
            # directory lives as long as that object does.
            self.tmp_dir_obj = tempfile.TemporaryDirectory(
                prefix='ariba.tmp.', dir=os.path.abspath(tmp_dir))
            self.tmp_dir = self.tmp_dir_obj.name
        else:
            self.tmp_dir_obj = None
            self.tmp_dir = os.path.join(self.outdir, 'clusters')
            try:
                os.mkdir(self.tmp_dir)
            except OSError as err:
                raise Error('Error making directory ' + self.tmp_dir) from err

        if self.verbose:
            print('Temporary directory:', self.tmp_dir)

        signal_names = [
            x for x in dir(signal)
            if x.startswith("SIG") and x not in {'SIGCHLD', 'SIGCLD'}
        ]
        for i in signal_names:
            try:
                signum = getattr(signal, i)
                signal.signal(signum, self._receive_signal)
            except (OSError, RuntimeError, TypeError, ValueError):
                # Best effort: some names are not catchable signals (or not
                # signal numbers at all), and handlers can only be installed
                # from the main thread. Skip whatever cannot be set.
                pass
Exemplo n.º 8
0
    def __init__(self,
      db_fasta,
      reads_1,
      reads_2,
      outdir,
      assembly_kmer=21,
      threads=1,
      verbose=False,
      assembler='velvet',
      smalt_k=13,
      smalt_s=2,
      smalt_min_id=0.9,
      spades_other=None,
      max_insert=1000,
      min_scaff_depth=10,
      nucmer_min_id=90,
      nucmer_min_len=50,
      nucmer_breaklen=50,
      assembled_threshold=0.95,
      unique_threshold=0.03,
      bcftools_exe='bcftools',
      gapfiller_exe='GapFiller.pl',
      samtools_exe='samtools',
      smalt_exe='smalt',
      bowtie2_exe='bowtie2',
      bowtie2_preset='very-sensitive-local',
      spades_exe='spades.py',
      sspace_exe='SSPACE_Basic_v2.0.pl',
      velvet_exe='velvet', # prefix of velvet{g,h}
      cdhit_seq_identity_threshold=0.9,
      cdhit_length_diff_cutoff=0.9,
      run_cd_hit=True,
      clean=1,
    ):
        '''Record the run parameters, derive file paths and create the output directories.

        Parameters handled specially:
          db_fasta, reads_1, reads_2, outdir: stored as absolute paths.
          assembler: must be 'velvet' or 'spades' (checked with an assert).
          sspace_exe: looked up with shutil.which. When it is not found, a
              warning is printed to stderr and both self.sspace_exe and
              self.gapfiller_exe are set to None.
          gapfiller_exe: looked up with shutil.which only when SSPACE was
              found. When it is not found, a warning is printed to stderr
              and self.gapfiller_exe is None.
          velvet_exe: stored as self.velvet.
        All remaining parameters are stored unchanged as attributes of the
        same name.

        Side effects: creates outdir and outdir/Clusters with os.mkdir, so
        neither may already exist.

        Raises:
          AssertionError: if assembler is not one of the supported names.
          Error: if an output directory cannot be created.
        '''
        self.db_fasta = os.path.abspath(db_fasta)
        self.reads_1 = os.path.abspath(reads_1)
        self.reads_2 = os.path.abspath(reads_2)
        self.outdir = os.path.abspath(outdir)
        self.clusters_outdir = os.path.join(self.outdir, 'Clusters')
        self.clusters_info_file = os.path.join(self.outdir, 'clusters.tsv')
        self.clean = clean

        self.assembler = assembler
        # NOTE(review): an assert is skipped when Python runs with -O; it is
        # kept so callers keep seeing AssertionError for a bad assembler.
        assert self.assembler in ['velvet', 'spades']
        self.assembly_kmer = assembly_kmer
        self.spades_other = spades_other

        self.db_fasta_clustered = os.path.join(self.outdir, 'genes.clustered.fa')
        self.cluster_ids = {}
        self.bam_prefix = os.path.join(self.outdir, 'map_all_reads')
        self.bam = self.bam_prefix + '.bam'
        self.report_file_tsv = os.path.join(self.outdir, 'report.tsv')
        self.report_file_xls = os.path.join(self.outdir, 'report.xls')
        self.threads = threads
        self.verbose = verbose

        self.smalt_k = smalt_k
        self.smalt_s = smalt_s
        self.smalt_min_id = smalt_min_id
        self.max_insert = max_insert
        self.smalt_exe = smalt_exe
        self.bowtie2_exe = bowtie2_exe
        self.bowtie2_preset = bowtie2_preset

        # Insert size statistics start unset; only the empty histogram is
        # created here.
        self.insert_hist_bin = 10
        self.insert_hist = histogram.Histogram(self.insert_hist_bin)
        self.insert_size = None
        self.insert_sspace_sd = None
        self.insert_proper_pair_max = None

        self.min_scaff_depth = min_scaff_depth
        self.nucmer_min_id = nucmer_min_id
        self.nucmer_min_len = nucmer_min_len
        self.nucmer_breaklen = nucmer_breaklen

        self.assembled_threshold = assembled_threshold
        self.unique_threshold = unique_threshold

        self.cluster_to_dir = {}  # gene name -> abs path of cluster directory
        self.clusters = {}        # gene name -> Cluster object

        self.bcftools_exe = bcftools_exe

        # GapFiller is only looked up when SSPACE is available.
        self.sspace_exe = shutil.which(sspace_exe)
        if self.sspace_exe is None:
            print('WARNING: SSPACE not found. Scaffolding and gap filling will be skipped!', file=sys.stderr)
            self.gapfiller_exe = None
        else:
            self.sspace_exe = os.path.realpath(self.sspace_exe) # otherwise sspace dies loading packages
            self.gapfiller_exe = shutil.which(gapfiller_exe)
            if self.gapfiller_exe is None:
                print('WARNING: GapFiller not found. No gap filling will be run after scaffolding!', file=sys.stderr)
            else:
                self.gapfiller_exe = os.path.realpath(self.gapfiller_exe) # otherwise gapfiller dies loading packages

        self.samtools_exe = samtools_exe
        self.spades_exe = spades_exe

        self.velvet = velvet_exe

        self.cdhit_seq_identity_threshold = cdhit_seq_identity_threshold
        self.cdhit_length_diff_cutoff = cdhit_length_diff_cutoff
        self.run_cd_hit = run_cd_hit

        for d in [self.outdir, self.clusters_outdir]:
            try:
                os.mkdir(d)
            except OSError as err:
                # Only filesystem failures are translated; KeyboardInterrupt
                # and SystemExit are left to propagate untouched.
                raise Error('Error mkdir ' + d) from err