Example #1
def align_bowtie2(task):
    for ref_order in range(1, task.ref_num + 1):
        logger.info('Running Bowtie2 alignment for ref #%d.' % ref_order)
        aligner_cwd = task.path.joinpath(task.id, 'alignment', 'bowtie2')
        ref_index_path = str(
            aligner_cwd.joinpath('%s_ref_%d' % (task.id, ref_order)))
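        # Use the host-removed reads when host depletion was requested; otherwise use the fastp-filtered reads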
        if task.dehost is not None:
            filterd_R1 = str(
                task.path.joinpath(task.id, 'reads',
                                   task.id + '_host_removed_R1.fastq.gz'))
            filterd_R2 = str(
                task.path.joinpath(task.id, 'reads',
                                   task.id + '_host_removed_R2.fastq.gz'))
        else:
            filterd_R1 = str(
                task.path.joinpath(task.id, 'reads', task.id + '_R1.fastq.gz'))
            filterd_R2 = str(
                task.path.joinpath(task.id, 'reads', task.id + '_R2.fastq.gz'))
        reads_cmd = ['-1', filterd_R1, '-2', filterd_R2]
        thread_cmd = ['-p', str(task.threads)]
        output_cmd = ['-S', '%s_ref_%d.sam' % (task.id, ref_order)]
        other_cmd = [
            '--very-sensitive-local', '--un-conc-gz',
            '%s' % ('%s_ref_%d_unmapped_R%%.fastq.gz' % (task.id, ref_order))
        ]
        aln_cmd = ['bowtie2', '-x', ref_index_path
                   ] + reads_cmd + output_cmd + thread_cmd + other_cmd
        logger.info('CMD: ' + ' '.join(aln_cmd))
        utils.write_log_file(task.path.joinpath(task.id),
                             'CMD: ' + ' '.join(aln_cmd))
        bt2_run = subprocess.run(aln_cmd, cwd=aligner_cwd, capture_output=True)
        print(bt2_run.stdout.decode(encoding='utf-8'))
        print(bt2_run.stderr.decode(encoding='utf-8'))
Example #2
def run_fastp(task):
    logger.info('Running fastp to filter reads.')
    original_R1 = task.path.joinpath(task.id, 'reads', 'original',
                                     task.id + '_R1.fastq.gz')
    original_R2 = task.path.joinpath(task.id, 'reads', 'original',
                                     task.id + '_R2.fastq.gz')
    filterd_R1 = task.path.joinpath(task.id, 'reads', task.id + '_R1.fastq.gz')
    filterd_R2 = task.path.joinpath(task.id, 'reads', task.id + '_R2.fastq.gz')
    report_json = task.path.joinpath(task.id, 'reads', 'fastp.json')
    report_html = task.path.joinpath(task.id, 'reads', 'fastp.html')
    fastp_cmd = [
        "fastp", "-i",
        str(original_R1), "-I",
        str(original_R2), "-o",
        str(filterd_R1), "-O",
        str(filterd_R2)
    ]
    reports_cmd = ['-j', str(report_json), '-h', str(report_html)]
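    # -f/-t trim the front/tail of read 1, -F/-T of read 2; -w sets the number of worker threads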
    parameter_cmd = [
        '-f',
        str(task.global_trimming), '-t',
        str(task.global_trimming), '-F',
        str(task.global_trimming), '-T',
        str(task.global_trimming), '-w',
        str(task.threads)
    ]
    logger.info('CMD: ' + ' '.join(fastp_cmd + reports_cmd + parameter_cmd))
    utils.write_log_file(
        task.path.joinpath(task.id),
        'CMD: ' + ' '.join(fastp_cmd + reports_cmd + parameter_cmd))
    fastp_run = subprocess.run(fastp_cmd + reports_cmd + parameter_cmd,
                               capture_output=True)
    print(fastp_run.stderr.decode(encoding='utf-8'))
Example #3
def align_flagstat(task, aligners):
    stats_dict = {'mapped_rate': {}}
    for aligner in aligners:
        logger.info('Analysis BAM file from %s' % aligner)
        aligner_cwd = task.path.joinpath(task.id, 'alignment', aligner)
        stats_dict['mapped_rate'][aligner] = {}
        for ref_order in range(1, task.ref_num + 1):
            flagstat_cmd = [
                'samtools', 'flagstat', '-@', task.threads,
                '%s_ref_%d.sorted.bam' % (task.id, ref_order)
            ]
            logger.info('CMD: ' + ' '.join(flagstat_cmd))
            utils.write_log_file(task.path.joinpath(task.id),
                                 'CMD: ' + ' '.join(flagstat_cmd))
            flagstat_run = subprocess.run(flagstat_cmd,
                                          cwd=aligner_cwd,
                                          capture_output=True)
            stats_text = flagstat_run.stdout.decode(encoding='utf-8')
            stats_list = stats_text.split('\n')
            utils.build_text_file(
                task.path.joinpath(aligner_cwd,
                                   'flagstat_ref_%d.txt' % ref_order),
                stats_text)
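            # stats_list[4] is the "... mapped (XX.XX% : N/A)" line of flagstat output; drop the leading '(' to keep the percentage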
            mapped_rate = stats_list[4].split(' ')[4][1:]
            stats_dict['mapped_rate'][aligner][ref_order] = mapped_rate
    utils.build_json_file(
        task.path.joinpath(task.id, 'alignment', 'flagstat.json'), stats_dict)
Example #4
def bam_sort_n_index(task, aligner):
    for ref_order in range(1, task.ref_num + 1):
        logger.info('Sorting & indexing BAM file for aln #%d.' % ref_order)
        aligner_cwd = task.path.joinpath(task.id, 'alignment', aligner)
        # sorting
        sorting_cmd = [
            'samtools', 'sort', '-@', task.threads,
            '%s_ref_%d.sam' % (task.id, ref_order), '-o',
            '%s_ref_%d.sorted.bam' % (task.id, ref_order)
        ]
        logger.info('CMD: ' + ' '.join(sorting_cmd))
        utils.write_log_file(task.path.joinpath(task.id),
                             'CMD: ' + ' '.join(sorting_cmd))
        sorting_run = subprocess.run(sorting_cmd,
                                     cwd=aligner_cwd,
                                     capture_output=True)
        print(sorting_run.stdout.decode(encoding='utf-8'))
        print(sorting_run.stderr.decode(encoding='utf-8'))
        # indexing
        indexing_cmd = [
            'samtools', 'index', '-@', task.threads,
            '%s_ref_%d.sorted.bam' % (task.id, ref_order)
        ]
        logger.info('CMD: ' + ' '.join(indexing_cmd))
        utils.write_log_file(task.path.joinpath(task.id),
                             'CMD: ' + ' '.join(indexing_cmd))
        index_run = subprocess.run(indexing_cmd,
                                   cwd=aligner_cwd,
                                   capture_output=True)
        print(index_run.stdout.decode(encoding='utf-8'))
        print(index_run.stderr.decode(encoding='utf-8'))
        # remove sam file to release disk space
        os.remove(aligner_cwd.joinpath('%s_ref_%d.sam' % (task.id, ref_order)))
Example #5
def ref_index(task, aligner):
    for ref_order in range(1, task.ref_num + 1):
        logger.info('Building Bowtie2 ref index for ref #%d.' % ref_order)
        aligner_cwd = task.path.joinpath(task.id, 'alignment', aligner)
        ref_fasta_path = task.path.joinpath(
            task.id, 'reference', '%s_ref_%d.fasta' % (task.id, ref_order))
        Path.mkdir(aligner_cwd, parents=True, exist_ok=True)
        shutil.copy2(ref_fasta_path, aligner_cwd)
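        # Build the aligner-specific index from the copied reference FASTA inside the aligner's working directory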
        if aligner == 'bowtie2':
            index_cmd = [
                'bowtie2-build', '--threads', task.threads,
                '%s_ref_%d.fasta' % (task.id, ref_order),
                '%s_ref_%d' % (task.id, ref_order)
            ]
        elif aligner == 'bwa':
            index_cmd = [
                'bwa', 'index', '-p',
                '%s_ref_%d' % (task.id, ref_order),
                '%s_ref_%d.fasta' % (task.id, ref_order)
            ]
        logger.info('CMD: ' + ' '.join(index_cmd))
        utils.write_log_file(task.path.joinpath(task.id),
                             'CMD: ' + ' '.join(index_cmd))
        ref_index_run = subprocess.run(index_cmd,
                                       cwd=aligner_cwd,
                                       capture_output=True)
        print(ref_index_run.stdout.decode(encoding='utf-8'))
        print(ref_index_run.stderr.decode(encoding='utf-8'))
Example #6
    def get_total_graphs(self):
        time_1 = datetime.now()

        for graph_idx in tqdm(self.graph_id_list):
            data = torch.load(
                os.path.join(self.data_processed_path,
                             '{}_{}.pt'.format(self.name, graph_idx)))
            self.total_graph[graph_idx] = data
        write_log_file(
            self.log_path,
            "load and append {} graph, time = {}".format(
                len(self.graph_id_list),
                datetime.now() - time_1))
Example #7
def align_bwa(task):
    for ref_order in range(1, task.ref_num + 1):
        logger.info('Running BWA alignment for ref #%d.' % ref_order)
        aligner_cwd = task.path.joinpath(task.id, 'alignment', 'bwa')
        ref_index_path = str(
            aligner_cwd.joinpath('%s_ref_%d' % (task.id, ref_order)))
        if task.dehost is not None:
            filterd_R1 = str(
                task.path.joinpath(task.id, 'reads',
                                   task.id + '_host_removed_R1.fastq.gz'))
            filterd_R2 = str(
                task.path.joinpath(task.id, 'reads',
                                   task.id + '_host_removed_R2.fastq.gz'))
        else:
            filterd_R1 = str(
                task.path.joinpath(task.id, 'reads', task.id + '_R1.fastq.gz'))
            filterd_R2 = str(
                task.path.joinpath(task.id, 'reads', task.id + '_R2.fastq.gz'))
        reads_cmd = [filterd_R1, filterd_R2]
        thread_cmd = ['-t', str(task.threads)]
        output_cmd = ['-o', '%s_ref_%d.sam' % (task.id, ref_order)]
        aln_cmd = ['bwa', 'mem'] + thread_cmd + [ref_index_path
                                                 ] + reads_cmd + output_cmd
        logger.info('CMD: ' + ' '.join(aln_cmd))
        utils.write_log_file(task.path.joinpath(task.id),
                             'CMD: ' + ' '.join(aln_cmd))
        bt2_run = subprocess.run(aln_cmd, cwd=aligner_cwd, capture_output=True)
        print(bt2_run.stdout.decode(encoding='utf-8'))
        print(bt2_run.stderr.decode(encoding='utf-8'))
Example #8
def align_coverage_stat(task, aligners):
    cov_dict = {}
    for aligner in aligners:
        cov_dict[aligner] = {}
        logger.info('Analysis coverage stats from %s BAM files.' % aligner)
        aligner_cwd = task.path.joinpath(task.id, 'alignment', aligner)
        for ref_order in range(1, task.ref_num + 1):
            cov_dict[aligner][ref_order] = {}
            flagstat_cmd = [
                'samtools', 'coverage',
                '%s_ref_%d.sorted.bam' % (task.id, ref_order)
            ]
            logger.info('CMD: ' + ' '.join(flagstat_cmd))
            utils.write_log_file(task.path.joinpath(task.id),
                                 'CMD: ' + ' '.join(flagstat_cmd))
            flagstat_run = subprocess.run(flagstat_cmd,
                                          cwd=aligner_cwd,
                                          capture_output=True)
            stats_text = flagstat_run.stdout.decode(encoding='utf-8')
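            # samtools coverage prints a tab-separated header line followed by one stats line per reference; pair the header fields with the first stats row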
            titles = stats_text.split('\n')[0].split('\t')
            stats = stats_text.split('\n')[1].split('\t')
            for i in range(len(titles)):
                cov_dict[aligner][ref_order][titles[i]] = stats[i]
    utils.build_json_file(
        task.path.joinpath(task.id, 'alignment', 'coverage_stat.json'),
        cov_dict)
Example #9
def variant_calling_lofreq(task):
    logger.info('Starting variant calling by LoFreq.')
    thread_cmd = ['call-parallel', '--pp-threads', str(task.threads)]
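    # LoFreq quality cutoffs: -q/-Q set the minimum base and alt-base qualities, -m the minimum mapping quality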
    other_cmd = [
        '--call-indels', '-N', '-B', '-q', '20', '-Q', '20', '-m', '20'
    ]

    for aligner in task.alns:
        logger.info('Running VC for %s output.' % aligner)
        aln_data_cwd = task.path.joinpath(task.id, 'alignment', aligner)
        for ref_order in range(1, task.ref_num + 1):
            aln_input_name = '%s_ref_%d.sorted.bam' % (task.id, ref_order)
            aln_indelqual_name = '%s_ref_%d.indelqual.sorted.bam' % (task.id,
                                                                     ref_order)
            ref_name = '%s_ref_%d.fasta' % (task.id, ref_order)
            # index ref
            faidx_cmd = ['lofreq', 'faidx', ref_name]
            logger.info('CMD: ' + ' '.join(faidx_cmd))
            utils.write_log_file(task.path.joinpath(task.id),
                                 'CMD: ' + ' '.join(faidx_cmd))
            faidx_run = subprocess.run(faidx_cmd,
                                       cwd=aln_data_cwd,
                                       capture_output=True)
            print(faidx_run.stdout.decode(encoding='utf-8'))
            print(faidx_run.stderr.decode(encoding='utf-8'))
            # indelqual
            indelqual_cmd = [
                'lofreq', 'indelqual', '--dindel', '--ref', ref_name, '--out',
                aln_indelqual_name, aln_input_name
            ]
            indelqual_run = subprocess.run(indelqual_cmd,
                                           cwd=aln_data_cwd,
                                           capture_output=True)
            print(indelqual_run.stdout.decode(encoding='utf-8'))
            print(indelqual_run.stderr.decode(encoding='utf-8'))
            # index indelqual-ed BAM
            indelqual_index_cmd = ['samtools', 'index', aln_indelqual_name]
            indelqual_index_run = subprocess.run(indelqual_index_cmd,
                                                 cwd=aln_data_cwd,
                                                 capture_output=True)
            print(indelqual_index_run.stdout.decode(encoding='utf-8'))
            print(indelqual_index_run.stderr.decode(encoding='utf-8'))
            # vc
            ref_cmd = ['-f', ref_name]
            output_cmd = [
                '-o',
                '%s_%s_ref_%d_lofreq.vcf' % (task.id, aligner, ref_order)
            ]
            vc_cmd = ['lofreq'] + thread_cmd + \
                ref_cmd + output_cmd + \
                other_cmd + [aln_indelqual_name]
            logger.info('CMD: ' + ' '.join(vc_cmd))
            utils.write_log_file(task.path.joinpath(task.id),
                                 'CMD: ' + ' '.join(vc_cmd))
            vc_run = subprocess.run(vc_cmd,
                                    cwd=aln_data_cwd,
                                    capture_output=True)
            print(vc_run.stdout.decode(encoding='utf-8'))
            print(vc_run.stderr.decode(encoding='utf-8'))
Example #10
    def test(self, iter_no):
        write_log_file(self.log_path, "Start to testing ...")
        test_query_ids = self.text_data.split_ids['test']
        success = {1: 0, 5: 0, 10: 0}
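        # success@k hit counters for k = 1, 5 and 10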
        total_test_scores = []
        test_start = datetime.now()
        for test_chunk in chunk(test_query_ids, 100):
            one_chunk_scores = []
            for i, query_id in enumerate(test_chunk):
                rank_ids, one_row_scores = self.retrieve_rank(
                    query_id, test_chunk, self.text_data, self.code_data)
                one_chunk_scores.append(one_row_scores)
                for k in success.keys():
                    if query_id in rank_ids[:k]:
                        success[k] += 1
            total_test_scores.append(one_chunk_scores)

        write_log_file(
            self.log_path,
            "\n&Testing Iteration {}: for {} queries finished. Time elapsed = {}."
            .format(iter_no, len(test_query_ids),
                    datetime.now() - test_start))

        all_mrr = []
        for i in range(len(total_test_scores)):
            one_chunk_square_score = total_test_scores[i]
            one_chunk_square_score = np.vstack(one_chunk_square_score)
            assert one_chunk_square_score.shape[
                0] == one_chunk_square_score.shape[
                    1], "Every Chunk must be square"
            mrr_array = self.calculate_square_mrr(one_chunk_square_score)
            all_mrr.extend(mrr_array)
        mrr = np.array(all_mrr).mean()
        self.test_iter.append(iter_no)
        self.test_mrr.append(mrr)
        write_log_file(
            self.log_path,
            "&Testing Iteration {}: MRR = &{}&".format(iter_no, mrr))

        for k, v in success.items():
            value = v * 1.0 / len(test_query_ids)
            write_log_file(
                self.log_path, "&Testing Iteration {}: S@{}@ = &{}&".format(
                    iter_no, k, value))
            if k == 1:
                self.test_s1.append(value)
            elif k == 5:
                self.test_s5.append(value)
            elif k == 10:
                self.test_s10.append(value)
            else:
                print('cannot find !')
        write_log_file(
            self.log_path,
            "S@1, S@5, S@10\n{}, {}, {}".format(self.test_s1[-1],
                                                self.test_s5[-1],
                                                self.test_s10[-1]))
Example #11
    def __init__(self, args):
        self.args = args
        self.dataset_dir = args.data_dir

        if self.args.only_test:
            self.sig = os.path.join(
                args.log_dir,
                "OnlyText_" + datetime.now().strftime("%Y-%m-%d@%H:%M:%S"))
        else:
            self.sig = os.path.join(
                args.log_dir,
                datetime.now().strftime("%Y-%m-%d@%H:%M:%S"))
        os.mkdir(self.sig)

        self.log_path = os.path.join(self.sig,
                                     'log_{}.txt'.format(args_file_name))
        self.best_model_path = os.path.join(self.sig, 'best_model.pt')

        table_draw = arguments_to_tables(args=arguments)
        write_log_file(self.log_path, str(table_draw))

        self.train_batch_size = args.train_batch_size
        self.valid_batch_size = args.valid_batch_size
        self.max_iteration = args.max_iter
        self.margin = args.margin

        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        write_log_file(self.log_path, "\n****CPU or GPU: " + str(self.device))

        max_number_edge_types = 3

        if self.args.conv.lower() in ['rgcn', 'cg', 'nnconv']:
            self.model = GraphMatchNetwork(
                node_init_dims=300,
                arguments=args,
                device=self.device,
                max_number_of_edges=max_number_edge_types).to(self.device)
        else:
            raise NotImplementedError

        write_log_file(self.log_path, str(self.model))
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=args.lr)

        write_log_file(self.log_path, "Init Reading Code Graphs ... ")
        self.code_data = ProcessedDataset(name='code',
                                          root=self.dataset_dir,
                                          log_path=self.log_path)
        write_log_file(self.log_path, "Init Reading Text Graphs ... ")
        self.text_data = ProcessedDataset(name='text',
                                          root=self.dataset_dir,
                                          log_path=self.log_path)

        # for plotting and record (init empty list)
        self.train_iter, self.train_smooth_loss, self.valid_iter, self.valid_loss, self.test_iter, self.test_mrr, self.test_s1, self.test_s5, self.test_s10 = (
            [] for _ in range(9))
Example #12
    def __init__(self, name, root, log_path):
        self.name = name
        self.data_processed_path = os.path.join(
            root, '{}_processed'.format(self.name))
        self.graph_id_file = os.path.join(
            root, '{}_graph_ids.pt'.format(self.name))
        self.graph_id_list = torch.load(self.graph_id_file)
        self.log_path = log_path

        self._check_whether_all_graph_ids_files_exist()
        self.total_graph = {}
        # self.get_total_graphs()

        # Split train, test, validation set
        if os.path.exists(os.path.join(root, 'split.json')):
            with open(os.path.join(root, 'split.json'), 'rb') as f:
                self.split_ids = json.loads(f.read())
        else:
            raise NotImplementedError
        write_log_file(
            self.log_path,
            "Train={}\nValid={}\nTest={}".format(len(self.split_ids['train']),
                                                 len(self.split_ids['valid']),
                                                 len(self.split_ids['test'])))
Example #13
def variant_calling_varscan2(task):
    logger.info('Starting variant calling by VarScan2.')
    mpileup_cmd = ['samtools', 'mpileup', '-B']
    mpileup2cns_cmd = ['varscan', 'mpileup2cns']
    output_cmd = ['--output-vcf', '1']
    other_cmd = ['--min-avg-qual', '20', '--P-value', '0.01']

    for aligner in task.alns:
        logger.info('Running VC for %s output.' % aligner)
        aln_data_cwd = task.path.joinpath(task.id, 'alignment', aligner)
        for ref_order in range(1, task.ref_num + 1):
            aln_input_cmd = [
                str(
                    aln_data_cwd.joinpath('%s_ref_%d.sorted.bam' %
                                          (task.id, ref_order)))
            ]
            ref_path = aln_data_cwd.joinpath('%s_ref_%d.fasta' %
                                             (task.id, ref_order))
            ref_cmd = ['-f', str(ref_path)]
            output_path = str(
                aln_data_cwd.joinpath('%s_%s_ref_%d_varscan.vcf' %
                                      (task.id, aligner, ref_order)))
            # Run samtools mpileup and pipe to varscan2
            samtools_cmd = mpileup_cmd + ref_cmd + aln_input_cmd
            logger.info('CMD: ' + ' '.join(samtools_cmd))
            utils.write_log_file(task.path.joinpath(task.id),
                                 'CMD: ' + ' '.join(samtools_cmd))
            samtools_run = subprocess.run(samtools_cmd,
                                          cwd=aln_data_cwd,
                                          capture_output=True)
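            # The mpileup output is piped into VarScan2 below via input=samtools_run.stdout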
            varscan2_cmd = mpileup2cns_cmd + other_cmd + output_cmd
            logger.info('CMD: ' + ' '.join(varscan2_cmd))
            utils.write_log_file(task.path.joinpath(task.id),
                                 'CMD: ' + ' '.join(varscan2_cmd))
            vc_run = subprocess.run(varscan2_cmd,
                                    cwd=aln_data_cwd,
                                    input=samtools_run.stdout,
                                    capture_output=True)
            utils.build_text_file(output_path,
                                  vc_run.stdout.decode(encoding='utf-8'))
            print(vc_run.stderr.decode(encoding='utf-8'))
Example #14
    def fit(self):
        best_val_loss = 1e10
        all_loss = []
        code_train_batch = self.code_data.triple_train_batch(
            self.train_batch_size)
        time_1 = datetime.now()
        for iteration in range(self.max_iteration):
            self.model.train()
            # Compute similarity
            pos_code_graph_id_list, text_graph_id_list, neg_code_graph_id_list = next(
                code_train_batch)  # next for yield
            pos_code_batch = self.code_data.get_batch_graph(
                pos_code_graph_id_list)
            text_batch = self.text_data.get_batch_graph(text_graph_id_list)
            neg_code_batch = self.code_data.get_batch_graph(
                neg_code_graph_id_list)

            pos_pred = self.model(pos_code_batch,
                                  text_batch).reshape(-1, 1)  # [batch, 1]
            neg_pred = self.model(neg_code_batch, text_batch).reshape(-1, 1)

            # Pairwise margin ranking loss: positive pairs should outscore
            # negative pairs by at least self.margin
            loss = (self.margin - pos_pred + neg_pred).clamp(min=1e-6).mean()

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            all_loss.append(loss)
            # Print
            if iteration % self.args.print_interval == 0 and iteration > 0:
                self.train_iter.append(iteration)
                self.train_smooth_loss.append(
                    torch.tensor(all_loss).mean().cpu().detach())

                write_log_file(
                    self.log_path,
                    '@Train Iter {}: mean smooth loss = @{}@, time = {}.'.
                    format(iteration,
                           torch.tensor(all_loss).mean(),
                           datetime.now() - time_1))
                all_loss = []
                time_1 = datetime.now()
            # Validation
            if (iteration % self.args.valid_interval == 0
                    and iteration >= self.args.val_start) or iteration == 0:
                s_time = datetime.now()
                loss = self.validation()
                self.valid_iter.append(iteration)
                self.valid_loss.append(loss.cpu().detach())
                end_time = datetime.now()
                if loss < best_val_loss:
                    write_log_file(
                        self.log_path,
                        '#Valid Iter {}: loss = #{}# (Decrease) < Best loss = {}. Save to best model..., time elapsed = {}.'
                        .format(iteration, loss, best_val_loss,
                                end_time - s_time))
                    best_val_loss = loss
                    torch.save(self.model.state_dict(), self.best_model_path)
                else:
                    write_log_file(
                        self.log_path,
                        '#Valid Iter {}: loss = #{}# (Increase). Best val loss = {}, time elapsed = {}.'
                        .format(iteration, loss, best_val_loss,
                                end_time - s_time))
            # only testing when iteration == 0 (whether code is rightly run)
            if iteration == 0:
                self.test(iter_no=iteration)
Example #15
def main_body():
    '''Main body of this file'''

    parser = argparse.ArgumentParser()

    # Configurations: read noisyspeech_synthesizer.cfg and gather inputs
    parser.add_argument(
        '--cfg',
        default='noisyspeech_synthesizer.cfg',
        help='Read noisyspeech_synthesizer.cfg for all the details')
    parser.add_argument('--cfg_str', type=str, default='noisy_speech')
    args = parser.parse_args()

    params = dict()
    params['args'] = args
    cfgpath = os.path.join(os.path.dirname(__file__), args.cfg)
    assert os.path.exists(cfgpath), f'No configuration file as [{cfgpath}]'

    cfg = CP.ConfigParser()
    cfg._interpolation = CP.ExtendedInterpolation()
    cfg.read(cfgpath)
    params['cfg'] = cfg._sections[args.cfg_str]
    cfg = params['cfg']

    clean_dir = os.path.join(os.path.dirname(__file__), 'CleanSpeech')
    if cfg['speech_dir'] != 'None':
        clean_dir = cfg['speech_dir']
    if not os.path.exists(clean_dir):
        assert False, ('Clean speech data is required')

    noise_dir = os.path.join(os.path.dirname(__file__), 'Noise')
    if cfg['noise_dir'] != 'None':
        noise_dir = cfg['noise_dir']
    if not os.path.exists(noise_dir):
        assert False, ('Noise data is required')

    params['fs'] = int(cfg['sampling_rate'])
    params['audioformat'] = cfg['audioformat']
    params['audio_length'] = float(cfg['audio_length'])
    params['silence_length'] = float(cfg['silence_length'])
    params['total_hours'] = float(cfg['total_hours'])

    if cfg['fileindex_start'] != 'None' and cfg['fileindex_end'] != 'None':
        params['num_files'] = int(cfg['fileindex_end']) - int(
            cfg['fileindex_start'])
        params['fileindex_start'] = int(cfg['fileindex_start'])
        params['fileindex_end'] = int(cfg['fileindex_end'])
    else:
        params['num_files'] = int(
            (params['total_hours'] * 60 * 60) / params['audio_length'])
        params['fileindex_start'] = 0
        params['fileindex_end'] = params['num_files']

    print('Number of files to be synthesized:', params['num_files'])

    params['is_test_set'] = utils.str2bool(cfg['is_test_set'])
    params['clean_activity_threshold'] = float(cfg['clean_activity_threshold'])
    params['noise_activity_threshold'] = float(cfg['noise_activity_threshold'])
    params['snr_lower'] = int(cfg['snr_lower'])
    params['snr_upper'] = int(cfg['snr_upper'])

    params['randomize_snr'] = utils.str2bool(cfg['randomize_snr'])
    params['target_level_lower'] = int(cfg['target_level_lower'])
    params['target_level_upper'] = int(cfg['target_level_upper'])

    if 'snr' in cfg.keys():
        params['snr'] = int(cfg['snr'])
    else:
        params['snr'] = int((params['snr_lower'] + params['snr_upper']) / 2)

    params['noisyspeech_dir'] = utils.get_dir(cfg, 'noisy_destination',
                                              'noisy')
    params['clean_proc_dir'] = utils.get_dir(cfg, 'clean_destination', 'clean')
    params['noise_proc_dir'] = utils.get_dir(cfg, 'noise_destination', 'noise')

    if 'speech_csv' in cfg.keys() and cfg['speech_csv'] != 'None':
        cleanfilenames = pd.read_csv(cfg['speech_csv'])
        cleanfilenames = cleanfilenames['filename']
    else:
        cleanfilenames = glob.glob(
            os.path.join(clean_dir, params['audioformat']))
    params['cleanfilenames'] = cleanfilenames
    shuffle(params['cleanfilenames'])
    params['num_cleanfiles'] = len(params['cleanfilenames'])
    # If there are .wav files in noise_dir directory, use those
    # If not, that implies that the noise files are organized into subdirectories by type,
    # so get the names of the non-excluded subdirectories
    if 'noise_csv' in cfg.keys() and cfg['noise_csv'] != 'None':
        noisefilenames = pd.read_csv(cfg['noise_csv'])
        noisefilenames = noisefilenames['filename']
    else:
        noisefilenames = glob.glob(
            os.path.join(noise_dir, params['audioformat']))

    if len(noisefilenames) != 0:
        shuffle(noisefilenames)
        params['noisefilenames'] = noisefilenames
    else:
        noisedirs = glob.glob(os.path.join(noise_dir, '*'))
        if cfg['noise_types_excluded'] != 'None':
            dirstoexclude = cfg['noise_types_excluded'].split(',')
            for dirs in dirstoexclude:
                noisedirs.remove(dirs)
        shuffle(noisedirs)
        params['noisedirs'] = noisedirs

    # Call main_gen() to generate audio
    clean_source_files, clean_clipped_files, clean_low_activity_files, \
    noise_source_files, noise_clipped_files, noise_low_activity_files = main_gen(params)

    # Create log directory if needed, and write log files of clipped and low activity files
    log_dir = utils.get_dir(cfg, 'log_dir', 'Logs')

    utils.write_log_file(log_dir, 'source_files.csv',
                         clean_source_files + noise_source_files)
    utils.write_log_file(log_dir, 'clipped_files.csv',
                         clean_clipped_files + noise_clipped_files)
    utils.write_log_file(log_dir, 'low_activity_files.csv', \
                         clean_low_activity_files + noise_low_activity_files)

    # Compute and print stats about percentange of clipped and low activity files
    total_clean = len(clean_source_files) + len(clean_clipped_files) + len(
        clean_low_activity_files)
    total_noise = len(noise_source_files) + len(noise_clipped_files) + len(
        noise_low_activity_files)
    pct_clean_clipped = round(len(clean_clipped_files) / total_clean * 100, 1)
    pct_noise_clipped = round(len(noise_clipped_files) / total_noise * 100, 1)
    pct_clean_low_activity = round(
        len(clean_low_activity_files) / total_clean * 100, 1)
    pct_noise_low_activity = round(
        len(noise_low_activity_files) / total_noise * 100, 1)

    print("Of the " + str(total_clean) + " clean speech files analyzed, " + \
          str(pct_clean_clipped) + "% had clipping, and " + str(pct_clean_low_activity) + \
          "% had low activity " + "(below " + str(params['clean_activity_threshold']*100) + \
          "% active percentage)")
    print("Of the " + str(total_noise) + " noise files analyzed, " + str(pct_noise_clipped) + \
          "% had clipping, and " + str(pct_noise_low_activity) + "% had low activity " + \
          "(below " + str(params['noise_activity_threshold']*100) + "% active percentage)")
Example #16
def main_body():
    '''Main body of this file'''

    parser = argparse.ArgumentParser()

    # Configurations: read noisyspeech_synthesizer.cfg and gather inputs
    parser.add_argument('--cfg', default='noisyspeech_synthesizer.cfg',
                        help='Read noisyspeech_synthesizer.cfg for all the details')
    parser.add_argument('--cfg_str', type=str, default='noisy_speech')
    args = parser.parse_args()

    params = dict()
    params['args'] = args
    cfgpath = os.path.join(os.path.dirname(__file__), args.cfg)
    assert os.path.exists(cfgpath), f'No configuration file as [{cfgpath}]'

    cfg = CP.ConfigParser()
    cfg._interpolation = CP.ExtendedInterpolation()
    cfg.read(cfgpath)
    params['cfg'] = cfg._sections[args.cfg_str]
    cfg = params['cfg']

    clean_dir = os.path.join(os.path.dirname(__file__), 'CleanSpeech')
    if cfg['speech_dir'] != 'None':
        clean_dir = cfg['speech_dir']
    if not os.path.exists(clean_dir):
        assert False, ('Clean speech data is required')

    noise_dir = os.path.join(os.path.dirname(__file__), 'Noise')
    if cfg['noise_dir'] != 'None':
        noise_dir = cfg['noise_dir']
    if not os.path.exists(noise_dir):
        assert False, ('Noise data is required')

    params['fs'] = int(cfg['sampling_rate'])
    params['audioformat'] = cfg['audioformat']
    params['audio_length'] = float(cfg['audio_length'])
    params['silence_length'] = float(cfg['silence_length'])
    params['total_hours'] = float(cfg['total_hours'])
    
    if cfg['fileindex_start'] != 'None' and cfg['fileindex_end'] != 'None':
        params['fileindex_start'] = int(cfg['fileindex_start'])
        params['fileindex_end'] = int(cfg['fileindex_end'])    
        params['num_files'] = int(params['fileindex_end'])-int(params['fileindex_start'])
    else:
        params['num_files'] = int((params['total_hours']*60*60)/params['audio_length'])

    print('Number of files to be synthesized:', params['num_files'])
    params['is_test_set'] = utils.str2bool(cfg['is_test_set'])
    params['clean_activity_threshold'] = float(cfg['clean_activity_threshold'])
    params['noise_activity_threshold'] = float(cfg['noise_activity_threshold'])
    params['snr_lower'] = int(cfg['snr_lower'])
    params['snr_upper'] = int(cfg['snr_upper'])
    params['randomize_snr'] = utils.str2bool(cfg['randomize_snr'])
    params['target_level_lower'] = int(cfg['target_level_lower'])
    params['target_level_upper'] = int(cfg['target_level_upper'])
    
    if 'snr' in cfg.keys():
        params['snr'] = int(cfg['snr'])
    else:
        params['snr'] = int((params['snr_lower'] + params['snr_upper'])/2)

    params['noisyspeech_dir'] = utils.get_dir(cfg, 'noisy_destination', 'noisy')
    params['clean_proc_dir'] = utils.get_dir(cfg, 'clean_destination', 'clean')
    params['noise_proc_dir'] = utils.get_dir(cfg, 'noise_destination', 'noise')

    if 'speech_csv' in cfg.keys() and cfg['speech_csv'] != 'None':
        cleanfilenames = pd.read_csv(cfg['speech_csv'])
        cleanfilenames = cleanfilenames['filename']
    else:
        cleanfilenames = glob.glob(os.path.join(clean_dir, params['audioformat']))
    params['cleanfilenames'] = cleanfilenames
    shuffle(params['cleanfilenames'])
    params['num_cleanfiles'] = len(params['cleanfilenames'])

    params['noisefilenames'] = glob.glob(os.path.join(noise_dir, params['audioformat']))
    shuffle(params['noisefilenames'])

    # Invoke multiple processes and fan out calls to main_gen() to these processes
    global clean_counter, noise_counter
    clean_counter = multiprocessing.Value('i', 0)
    noise_counter = multiprocessing.Value('i', 0)    
    
    # Hand the shared counters to every worker process through the Pool initializer
    multi_pool = multiprocessing.Pool(processes=PROCESSES,
                                      initializer=init,
                                      initargs=(clean_counter, noise_counter))
    fileindices = range(params['num_files'])    
    output_lists = multi_pool.starmap(main_gen, zip(repeat(params), fileindices))

    flat_output_lists = []
    num_lists = 6
    for i in range(num_lists):
        flat_output_lists.append(extract_list(output_lists, i))

    # Create log directory if needed, and write log files of clipped and low activity files
    log_dir = utils.get_dir(cfg, 'log_dir', 'Logs')

    utils.write_log_file(log_dir, 'source_files.csv', flat_output_lists[0] + flat_output_lists[3])
    utils.write_log_file(log_dir, 'clipped_files.csv', flat_output_lists[1] + flat_output_lists[4])
    utils.write_log_file(log_dir, 'low_activity_files.csv', flat_output_lists[2] + flat_output_lists[5])
    
    # Compute and print stats about percentange of clipped and low activity files
    total_clean = len(flat_output_lists[0]) + len(flat_output_lists[1]) + len(flat_output_lists[2])
    total_noise = len(flat_output_lists[3]) + len(flat_output_lists[4]) + len(flat_output_lists[5])
    pct_clean_clipped = round(len(flat_output_lists[1])/total_clean*100, 1)
    pct_noise_clipped = round(len(flat_output_lists[4])/total_noise*100, 1)
    pct_clean_low_activity = round(len(flat_output_lists[2])/total_clean*100, 1)
    pct_noise_low_activity = round(len(flat_output_lists[5])/total_noise*100, 1)
    
    print("Of the " + str(total_clean) + " clean speech files analyzed, " + str(pct_clean_clipped) + \
          "% had clipping, and " + str(pct_clean_low_activity) + "% had low activity " + \
          "(below " + str(params['clean_activity_threshold']*100) + "% active percentage)")
    print("Of the " + str(total_noise) + " noise files analyzed, " + str(pct_noise_clipped) + \
          "% had clipping, and " + str(pct_noise_low_activity) + "% had low activity " + \
          "(below " + str(params['noise_activity_threshold']*100) + "% active percentage)")
Example #17
def main():
    task = Task()
    task.conda_pkgs = [
        'fastp', 'samtools', 'bcftools',
        'bowtie2', 'bwa',
        'varscan', 'lofreq',
        'spades.py', 'blastn', 'makeblastdb'
        ]
    check_deps(task)
    task.path = Path.cwd().joinpath('tasks')
    task.name = args.prefix
    task.id = ''
    task.ref = args.ref
    task.with_ref = False
    task.ex_r1 = args.r1
    task.ex_r2 = args.r2
    task.threads = str(args.threads)
    task.alns = args.alns.split(',')
    task.global_trimming = str(args.trimming)
    task.dehost = args.remove_host
    task.spades_mem = str(args.spades_mem)
    task.spades_mode = args.spades_mode
    task.vc_threshold = '0.7'
    task.ref_num = 0
    task.min_vc_score = args.min_vc_score
    
    if args.test is not None:
        task.name = 'test_run'
        task.ex_r1 = Path.cwd().joinpath('test_data','AdV_R1.fastq.gz')
        task.ex_r2 = Path.cwd().joinpath('test_data','AdV_R2.fastq.gz')
        if args.test == 'ref':
            task.ref = Path.cwd().joinpath('test_data', 'AC_000008.1.fasta')
        elif args.test == 'multi_ref':
            task.ref = Path.cwd().joinpath('test_data', 'adv_multi_ref.fasta')
        elif args.test == 'denovo':
            task.dehost = 'human'
            task.ref = None

    logger.info('Checking reference.')
    if task.ref is not None:
        if check_ref_file(task):
            task.with_ref = True
        else:
            logger.error('Input reference not found. Exiting pipeline.')
            sys.exit()
    else:
        logger.info('Input reference not provided. Will go de novo')
    
    if not task.with_ref:
        logger.info('Checking RVDB.')
        if utils.setup_rvdb() == -1:
            logger.error('RVDB setup error. Exiting pipeline.')
            sys.exit()

    if task.dehost is not None:
        if utils.setup_genomes(task.dehost) == -1:
            logger.error('Host genome not found. Exiting pipeline.')
            sys.exit()
    
    logger.info('Checking reads files.')
    if check_reads_file(task) != -1:
        task.id = "%s_%s" % (task.name, time.strftime(
            "%Y%m%d%H%M", time.localtime()))
        logger.info('Creating new task %s.' % task.id)
        Path.mkdir(task.path.joinpath(task.id), parents=True)
        logger.info('Starting pipeline.')
        utils.write_log_file(
            task.path.joinpath(task.id),
            'Starting pipeline.'
        )

        # main pipeline
        reads_preprocess.run(task)
        reference_prepare.run(task)
        reads_alignment.run(task)
        variant_calling.run(task)
        logger.info('Pipeline finished.')
        utils.write_log_file(
            task.path.joinpath(task.id),
            'Pipeline finished.'
        )

        # report generator
        summary_generator.run(task)
        report_generator.run(task)
        
    else:
        logger.error('Reads not found. Exiting pipeline.')
        sys.exit()
Example #18
def remove_host(task):
    logger.info('Removing host genome.')
    dehost_meta = {'genome': '', 'remove_percentage': ''}
    host_remove_cwd = task.path.joinpath(task.id, 'reads')
    Path.mkdir(host_remove_cwd, parents=True, exist_ok=True)

    if task.dehost == 'dog':
        dehost_meta['genome'] = 'Dog (Dog10K_Boxer_Tasha, GCF_000002285.5)'
        genome_path = '/app/genomes/' + 'dog10k'
    elif task.dehost == 'human':
        dehost_meta['genome'] = 'Human (GRCh38.p13, GCF_000001405.39)'
        genome_path = '/app/genomes/' + 'grch38'
    elif task.dehost == 'vero':
        dehost_meta['genome'] = 'Vero (Vero_WHO_p1.0, GCF_015252025.1)'
        genome_path = '/app/genomes/' + 'vero'
    elif task.dehost == 'chicken':
        dehost_meta['genome'] = 'Chicken (GRCg6a, GCF_000002315.6)'
        genome_path = '/app/genomes/' + 'grcg6a'
    elif task.dehost == 'rhesus_monkey':
        dehost_meta['genome'] = 'Rhesus monkey (Mmul_10, GCF_003339765.1)'
        genome_path = '/app/genomes/' + 'mmul_10'

    # bowtie2 replaces '%' in this name with 1 and 2 to write the paired host-removed FASTQ files
    unconc_reads_out = task.id + '_host_removed_R%.fastq.gz'
    mapped_reads_out = 'host_mapped.sam'
    align_cmd = [
        'bowtie2', '-p',
        str(task.threads), '-x',
        str(genome_path), '-1',
        str(task.path.joinpath(task.id, 'reads',
                               task.id + '_R1.fastq.gz')), '-2',
        str(task.path.joinpath(task.id, 'reads', task.id + '_R2.fastq.gz')),
        '-S',
        str(mapped_reads_out), '--very-sensitive-local', '--un-conc-gz',
        '%s' % str(unconc_reads_out)
    ]
    logger.info('CMD: ' + ' '.join(align_cmd))
    utils.write_log_file(task.path.joinpath(task.id),
                         'CMD: ' + ' '.join(align_cmd))
    cmd_run = subprocess.run(align_cmd,
                             cwd=host_remove_cwd,
                             capture_output=True)
    # print(cmd_run.stdout.decode(encoding='utf-8'))
    print(cmd_run.stderr.decode(encoding='utf-8'))

    # build meta
    logger.info('Analysis BAM file from host mapped reads')
    # sorting
    sorting_cmd = [
        'samtools', 'sort', '-@', task.threads, 'host_mapped.sam', '-o',
        'host_mapped.sorted.bam'
    ]
    logger.info('CMD: ' + ' '.join(sorting_cmd))
    utils.write_log_file(task.path.joinpath(task.id),
                         'CMD: ' + ' '.join(sorting_cmd))
    sorting_run = subprocess.run(sorting_cmd,
                                 cwd=host_remove_cwd,
                                 capture_output=True)
    print(sorting_run.stdout.decode(encoding='utf-8'))
    print(sorting_run.stderr.decode(encoding='utf-8'))
    # flagstat
    flagstat_cmd = [
        'samtools', 'flagstat', '-@', task.threads, 'host_mapped.sorted.bam'
    ]
    logger.info('CMD: ' + ' '.join(flagstat_cmd))
    utils.write_log_file(task.path.joinpath(task.id),
                         'CMD: ' + ' '.join(flagstat_cmd))
    flagstat_run = subprocess.run(flagstat_cmd,
                                  cwd=host_remove_cwd,
                                  capture_output=True)
    stats_text = flagstat_run.stdout.decode(encoding='utf-8')
    stats_list = stats_text.split('\n')
    utils.build_text_file(task.path.joinpath(host_remove_cwd, 'flagstat.txt'),
                          stats_text)
    mapped_rate = stats_list[4].split(' ')[4][1:]
    dehost_meta['remove_percentage'] = mapped_rate
    utils.build_json_file(
        task.path.joinpath(host_remove_cwd, 'dehost_meta.json'), dehost_meta)
    # remove sam file to release disk space
    os.remove(task.path.joinpath(host_remove_cwd, mapped_reads_out))
    # remove host bam file to release disk space
    os.remove(task.path.joinpath(host_remove_cwd, 'host_mapped.sorted.bam'))
Example #19
    pc_clean_sr_passed = round(clean_sr_results_list.count('True')/total_clips*100, 1)
    pc_noise_sr_passed = round(noise_sr_results_list.count('True')/total_clips*100, 1)
    pc_noisy_sr_passed = round(noisy_sr_results_list.count('True')/total_clips*100, 1)
    pc_clean_clipping_passed = round(clean_clipping_results_list.count('True')/total_clips*100, 1)
    pc_noise_clipping_passed = round(noise_clipping_results_list.count('True')/total_clips*100, 1)
    pc_noisy_clipping_passed = round(noisy_clipping_results_list.count('True')/total_clips*100, 1)

    print('% clips that passed SNR test:', pc_snr_passed)
    
    print('% clean clips that passed Normalization tests:', pc_clean_norm_passed)
    print('% noise clips that passed Normalization tests:', pc_noise_norm_passed)
    print('% noisy clips that passed Normalization tests:', pc_noisy_norm_passed)

    print('% clean clips that passed Sampling Rate tests:', pc_clean_sr_passed)
    print('% noise clips that passed Sampling Rate tests:', pc_noise_sr_passed)
    print('% noisy clips that passed Sampling Rate tests:', pc_noisy_sr_passed)

    print('% clean clips that passed Clipping tests:', pc_clean_clipping_passed)
    print('% noise clips that passed Clipping tests:', pc_noise_clipping_passed)
    print('% noisy clips that passed Clipping tests:', pc_noisy_clipping_passed)

    log_dir = utils.get_dir(cfg, 'unit_tests_log_dir', 'Unit_tests_logs')
    
    if not os.path.exists(log_dir):
        log_dir = os.path.join(os.path.dirname(__file__), 'Unit_tests_logs')
        os.makedirs(log_dir)
    
    utils.write_log_file(log_dir, 'unit_test_results.csv', [noisy_filenames_list, clean_filenames_list, \
                            noise_filenames_list, snr_results_list, clean_norm_results_list, noise_norm_results_list, \
                            noisy_norm_results_list, clean_sr_results_list, noise_sr_results_list, noisy_sr_results_list, \
                            clean_clipping_results_list, noise_clipping_results_list, noisy_clipping_results_list])
Example #20
def main_body():
    '''Main body of this file'''

    parser = argparse.ArgumentParser()

    # Configurations: read noisyspeech_synthesizer.cfg and gather inputs
    parser.add_argument(
        '--cfg',
        default='pdns_synthesizer_icassp2022.cfg',
        help='Read noisyspeech_synthesizer.cfg for all the details')
    parser.add_argument('--cfg_str', type=str, default='noisy_speech')
    args = parser.parse_args()

    params = dict()
    params['args'] = args
    cfgpath = os.path.join(args.cfg)
    # os.path.join(os.path.dirname(__file__), args.cfg)
    assert os.path.exists(cfgpath), f'No configuration file as [{cfgpath}]'

    cfg = CP.ConfigParser()
    cfg._interpolation = CP.ExtendedInterpolation()
    cfg.read(cfgpath)
    params['cfg'] = cfg._sections[args.cfg_str]
    cfg = params['cfg']

    clean_dir = os.path.join('datasets/clean')

    if cfg['speech_dir'] != 'None':
        clean_dir = cfg['speech_dir']

    if not os.path.exists(clean_dir):
        assert False, ('Clean speech data is required')

    if cfg['speech_dir2'] != 'None':
        clean_dir2 = cfg['speech_dir2']

    if cfg['spkid_csv'] != 'None':
        spkid_csv = cfg['spkid_csv']

    if not os.path.exists(clean_dir2):
        assert False, ('Clean speech2 data is required')

    if cfg['rir_dir'] != 'None':
        rir_dir = cfg['rir_dir']

    if cfg['noise_dir'] != 'None':
        noise_dir = cfg['noise_dir']
    if not os.path.exists(noise_dir):
        assert False, ('Noise data is required')

    print(clean_dir)
    print(clean_dir2)
    print(noise_dir)
    print(spkid_csv)
    print(rir_dir)

    if cfg['noise_dir'] != 'None':
        noise_dir = cfg['noise_dir']
    if not os.path.exists(noise_dir):
        assert False, ('Noise data is required')

    params['fs'] = int(cfg['sampling_rate'])
    params['audioformat'] = cfg['audioformat']
    params['audio_length'] = float(cfg['audio_length'])
    params['silence_length'] = float(cfg['silence_length'])
    params['total_hours'] = float(cfg['total_hours'])

    # clean singing speech
    params['clean_singing'] = str(cfg['clean_singing'])
    params['singing_choice'] = int(cfg['singing_choice'])

    # rir
    params['rir_choice'] = int(cfg['rir_choice'])
    params['lower_t60'] = float(cfg['lower_t60'])
    params['upper_t60'] = float(cfg['upper_t60'])
    params['rir_table_csv'] = str(cfg['rir_table_csv'])
    params['clean_speech_t60_csv'] = str(cfg['clean_speech_t60_csv'])

    if cfg['fileindex_start'] != 'None' and cfg['fileindex_end'] != 'None':
        params['num_files'] = int(cfg['fileindex_end']) - int(
            cfg['fileindex_start'])
        params['fileindex_start'] = int(cfg['fileindex_start'])
        params['fileindex_end'] = int(cfg['fileindex_end'])
    else:
        params['num_files'] = int(
            (params['total_hours'] * 60 * 60) / params['audio_length'])
        params['fileindex_start'] = 0
        params['fileindex_end'] = int(params['num_files'])

    print('Number of files to be synthesized:', params['num_files'])

    params['is_test_set'] = utils.str2bool(cfg['is_test_set'])
    params['clean_activity_threshold'] = float(cfg['clean_activity_threshold'])
    params['noise_activity_threshold'] = float(cfg['noise_activity_threshold'])
    params['snr_lower'] = int(cfg['snr_lower'])
    params['snr_upper'] = int(cfg['snr_upper'])

    params['randomize_snr'] = utils.str2bool(cfg['randomize_snr'])
    params['target_level_lower'] = int(cfg['target_level_lower'])
    params['target_level_upper'] = int(cfg['target_level_upper'])

    if 'snr' in cfg.keys():
        params['snr'] = int(cfg['snr'])
    else:
        params['snr'] = int((params['snr_lower'] + params['snr_upper']) / 2)

    params['noisyspeech_dir'] = utils.get_dir(cfg, 'noisy_destination',
                                              'noisy')
    params['clean_proc_dir'] = utils.get_dir(cfg, 'clean_destination', 'clean')
    params['noise_proc_dir'] = utils.get_dir(cfg, 'noise_destination', 'noise')

    if 'speech_csv' in cfg.keys() and cfg['speech_csv'] != 'None':
        cleanfilenames = pd.read_csv(cfg['speech_csv'])
        cleanfilenames = cleanfilenames['filename']
    else:
        cleanfilenames = []
        for path in Path(cfg['speech_dir']).rglob('*.wav'):
            cleanfilenames.append(str(path.resolve()))

    selected_primary = []
    selected_secondary = []

    with open(spkid_csv, 'r') as file:
        my_reader = csv.reader(file, delimiter=',')
        for row in my_reader:
            if row[1] == 'primary':
                selected_primary.append(row)
            elif row[1] == 'secondary':
                selected_secondary.append(row)

    cleanfilenames = []
    for row in selected_primary:
        cleanfilenames.append(row[0])

    cleanfilenames2 = []
    for row in selected_secondary:
        cleanfilenames2.append(row[0])

    params['cleanfilenames'] = cleanfilenames

    shuffle(cleanfilenames2)
    params['cleanfilenames2'] = cleanfilenames2

    rirfilenames = []
    for path in Path(cfg['rir_dir']).rglob('*.wav'):
        rirfilenames.append(str(path.resolve()))

    shuffle(rirfilenames)
    params['myrir'] = rirfilenames

    if 'noise_csv' in cfg.keys() and cfg['noise_csv'] != 'None':
        noisefilenames = pd.read_csv(cfg['noise_csv'])
        noisefilenames = noisefilenames['filename']
    else:
        noisefilenames = glob.glob(
            os.path.join(noise_dir, params['audioformat']))

    if len(noisefilenames) != 0:
        shuffle(noisefilenames)
        params['noisefilenames'] = noisefilenames
    else:
        noisedirs = glob.glob(os.path.join(noise_dir, '*'))
        if cfg['noise_types_excluded'] != 'None':
            dirstoexclude = cfg['noise_types_excluded'].split(',')
            for dirs in dirstoexclude:
                noisedirs.remove(dirs)
        shuffle(noisedirs)
        params['noisedirs'] = noisedirs

    # Call main_gen() to generate audio
    clean_source_files, clean_clipped_files, clean_low_activity_files, \
    noise_source_files, noise_clipped_files, noise_low_activity_files = main_gen(params)

    # Create log directory if needed, and write log files of clipped and low activity files
    log_dir = utils.get_dir(cfg, 'log_dir', 'Logs')

    utils.write_log_file(log_dir, 'source_files.csv',
                         clean_source_files + noise_source_files)
    utils.write_log_file(log_dir, 'clipped_files.csv',
                         clean_clipped_files + noise_clipped_files)
    utils.write_log_file(log_dir, 'low_activity_files.csv', \
                         clean_low_activity_files + noise_low_activity_files)

    # Compute and print stats about percentange of clipped and low activity files
    total_clean = len(clean_source_files) + len(clean_clipped_files) + len(
        clean_low_activity_files)
    total_noise = len(noise_source_files) + len(noise_clipped_files) + len(
        noise_low_activity_files)

    pct_clean_clipped = round(len(clean_clipped_files) / total_clean * 100, 1)
    pct_noise_clipped = round(len(noise_clipped_files) / total_noise * 100, 1)
    pct_clean_low_activity = round(
        len(clean_low_activity_files) / total_clean * 100, 1)
    pct_noise_low_activity = round(
        len(noise_low_activity_files) / total_noise * 100, 1)

    print("Of the " + str(total_clean) + " clean speech files analyzed, " + \
          str(pct_clean_clipped) + "% had clipping, and " + str(pct_clean_low_activity) + \
          "% had low activity " + "(below " + str(params['clean_activity_threshold']*100) + \
          "% active percentage)")
    print("Of the " + str(total_noise) + " noise files analyzed, " + str(pct_noise_clipped) + \
          "% had clipping, and " + str(pct_noise_low_activity) + "% had low activity " + \
          "(below " + str(params['noise_activity_threshold']*100) + "% active percentage)")
Example #21
def main_body():
    '''Main body of this file'''

    parser = argparse.ArgumentParser()

    # Configurations: read noisyspeech_synthesizer.cfg and gather inputs
    parser.add_argument('--cfg', default='noisyspeech_synthesizer.cfg',
                        help='Read noisyspeech_synthesizer.cfg for all the details')
    parser.add_argument('--cfg_str', type=str, default='noisy_speech')
    args = parser.parse_args()

    params = dict()
    params['args'] = args
    cfgpath = os.path.join(os.path.dirname(__file__), args.cfg)
    assert os.path.exists(cfgpath), f'No configuration file as [{cfgpath}]'

    cfg = CP.ConfigParser()
    cfg._interpolation = CP.ExtendedInterpolation()
    cfg.read(cfgpath)
    params['cfg'] = cfg._sections[args.cfg_str]
    cfg = params['cfg']

    clean_dir = os.path.join(os.path.dirname(__file__), 'datasets/clean')

    if cfg['speech_dir'] != 'None':
        clean_dir = cfg['speech_dir']
    if not os.path.exists(clean_dir):
        assert False, ('Clean speech data is required')

    noise_dir = os.path.join(os.path.dirname(__file__), 'datasets/noise')

    if cfg['noise_dir'] != 'None':
        noise_dir = cfg['noise_dir']
    if not os.path.exists(noise_dir):
        assert False, ('Noise data is required')

    params['fs'] = int(cfg['sampling_rate'])
    params['audioformat'] = cfg['audioformat']
    params['audio_length'] = float(cfg['audio_length'])
    params['silence_length'] = float(cfg['silence_length'])
    params['total_hours'] = float(cfg['total_hours'])
    
    # clean singing speech
    params['use_singing_data'] = int(cfg['use_singing_data'])
    params['clean_singing'] = str(cfg['clean_singing'])
    params['singing_choice'] = int(cfg['singing_choice'])

    # clean emotional speech
    params['use_emotion_data'] = int(cfg['use_emotion_data'])
    params['clean_emotion'] = str(cfg['clean_emotion'])
    
    # clean mandarin speech
    params['use_mandarin_data'] = int(cfg['use_mandarin_data'])
    params['clean_mandarin'] = str(cfg['clean_mandarin'])
    
    # rir
    params['rir_choice'] = int(cfg['rir_choice'])
    params['lower_t60'] = float(cfg['lower_t60'])
    params['upper_t60'] = float(cfg['upper_t60'])
    params['rir_table_csv'] = str(cfg['rir_table_csv'])
    params['clean_speech_t60_csv'] = str(cfg['clean_speech_t60_csv'])

    if cfg['fileindex_start'] != 'None' and cfg['fileindex_end'] != 'None':
        params['num_files'] = int(cfg['fileindex_end'])-int(cfg['fileindex_start'])
        params['fileindex_start'] = int(cfg['fileindex_start'])
        params['fileindex_end'] = int(cfg['fileindex_end'])
    else:
        params['num_files'] = int((params['total_hours']*60*60)/params['audio_length'])
        params['fileindex_start'] = 0
        params['fileindex_end'] = params['num_files']

    print('Number of files to be synthesized:', params['num_files'])
    
    params['is_test_set'] = utils.str2bool(cfg['is_test_set'])
    params['clean_activity_threshold'] = float(cfg['clean_activity_threshold'])
    params['noise_activity_threshold'] = float(cfg['noise_activity_threshold'])
    params['snr_lower'] = int(cfg['snr_lower'])
    params['snr_upper'] = int(cfg['snr_upper'])
    
    params['randomize_snr'] = utils.str2bool(cfg['randomize_snr'])
    params['target_level_lower'] = int(cfg['target_level_lower'])
    params['target_level_upper'] = int(cfg['target_level_upper'])
    
    if 'snr' in cfg.keys():
        params['snr'] = int(cfg['snr'])
    else:
        params['snr'] = int((params['snr_lower'] + params['snr_upper'])/2)

    params['noisyspeech_dir'] = utils.get_dir(cfg, 'noisy_destination', 'noisy')
    params['clean_proc_dir'] = utils.get_dir(cfg, 'clean_destination', 'clean')
    params['noise_proc_dir'] = utils.get_dir(cfg, 'noise_destination', 'noise')

    if 'speech_csv' in cfg.keys() and cfg['speech_csv'] != 'None':
        cleanfilenames = pd.read_csv(cfg['speech_csv'])
        cleanfilenames = cleanfilenames['filename']
    else:
        #cleanfilenames = glob.glob(os.path.join(clean_dir, params['audioformat']))
        cleanfilenames= []
        for path in Path(clean_dir).rglob('*.wav'):
            cleanfilenames.append(str(path.resolve()))

    shuffle(cleanfilenames)
    # add singing voice to clean speech
    if params['use_singing_data'] == 1:
        all_singing = []
        for path in Path(params['clean_singing']).rglob('*.wav'):
            all_singing.append(str(path.resolve()))

        if params['singing_choice'] == 1:    # male speakers
            mysinging = [s for s in all_singing if ("male" in s and "female" not in s)]
        elif params['singing_choice'] == 2:  # female speakers
            mysinging = [s for s in all_singing if "female" in s]
        elif params['singing_choice'] == 3:  # both male and female
            mysinging = all_singing
        else:                                # default: both male and female
            mysinging = all_singing

        shuffle(mysinging)
        if mysinging is not None:
            all_cleanfiles = cleanfilenames + mysinging
    else:
        all_cleanfiles = cleanfilenames

    # add emotional speech data to clean speech
    if params['use_emotion_data'] == 1:
        all_emotion = []
        for path in Path(params['clean_emotion']).rglob('*.wav'):
            all_emotion.append(str(path.resolve()))

        shuffle(all_emotion)
        if all_emotion is not None:
            all_cleanfiles = all_cleanfiles + all_emotion
    else:
        print('NOT using emotion data for training!')

    # add Mandarin speech data to clean speech
    if params['use_mandarin_data'] == 1:
        all_mandarin = []
        for path in Path(params['clean_mandarin']).rglob('*.wav'):
            all_mandarin.append(str(path.resolve()))

        shuffle(all_mandarin)
        if all_mandarin is not None:
            all_cleanfiles = all_cleanfiles + all_mandarin
    else:
        print('NOT using non-English (Mandarin) data for training!')
        

    params['cleanfilenames'] = all_cleanfiles
    params['num_cleanfiles'] = len(params['cleanfilenames'])
    # If there are .wav files in noise_dir directory, use those
    # If not, that implies that the noise files are organized into subdirectories by type,
    # so get the names of the non-excluded subdirectories
    if 'noise_csv' in cfg.keys() and cfg['noise_csv'] != 'None':
        noisefilenames = pd.read_csv(cfg['noise_csv'])
        noisefilenames = noisefilenames['filename']
    else:
        noisefilenames = glob.glob(os.path.join(noise_dir, params['audioformat']))

    if len(noisefilenames)!=0:
        shuffle(noisefilenames)
        params['noisefilenames'] = noisefilenames
    else:
        noisedirs = glob.glob(os.path.join(noise_dir, '*'))
        if cfg['noise_types_excluded'] != 'None':
            dirstoexclude = cfg['noise_types_excluded'].split(',')
            for dirs in dirstoexclude:
                noisedirs.remove(dirs)
        shuffle(noisedirs)
        params['noisedirs'] = noisedirs

    # rir
    temp = pd.read_csv(params['rir_table_csv'], skiprows=[1], sep=',', header=None,
                       names=['wavfile', 'channel', 'T60_WB', 'C50_WB', 'isRealRIR'])

    rir_wav = temp['wavfile'][1:]  # 115413
    rir_channel = temp['channel'][1:]
    rir_t60 = temp['T60_WB'][1:]
    rir_isreal = temp['isRealRIR'][1:]

    rir_wav2 = [w.replace('\\', '/') for w in rir_wav]
    rir_channel2 = [w for w in rir_channel]
    rir_t60_2 = [w for w in rir_t60]
    rir_isreal2 = [w for w in rir_isreal]

    myrir = []
    mychannel = []
    myt60 = []

    lower_t60 = params['lower_t60']
    upper_t60 = params['upper_t60']

    if params['rir_choice'] == 1:    # real RIRs only (3076 IRs)
        selected_indices = [i for i, x in enumerate(rir_isreal2) if x == "1"]
    elif params['rir_choice'] == 2:  # synthetic RIRs only (112337 IRs)
        selected_indices = [i for i, x in enumerate(rir_isreal2) if x == "0"]
    else:                            # rir_choice == 3 or default: both real and synthetic
        selected_indices = [i for i, x in enumerate(rir_isreal2)]

    # keep only RIRs whose T60 falls inside the configured range
    chosen_i = []
    for i in selected_indices:
        if lower_t60 <= float(rir_t60_2[i]) <= upper_t60:
            chosen_i.append(i)

    myrir = [rir_wav2[i] for i in chosen_i]
    mychannel = [rir_channel2[i] for i in chosen_i]
    myt60 = [rir_t60_2[i] for i in chosen_i]

    params['myrir'] = myrir
    params['mychannel'] = mychannel
    params['myt60'] = myt60

    # Call main_gen() to generate audio
    clean_source_files, clean_clipped_files, clean_low_activity_files, \
    noise_source_files, noise_clipped_files, noise_low_activity_files = main_gen(params)

    # Create log directory if needed, and write log files of clipped and low activity files
    log_dir = utils.get_dir(cfg, 'log_dir', 'Logs')

    utils.write_log_file(log_dir, 'source_files.csv', clean_source_files + noise_source_files)
    utils.write_log_file(log_dir, 'clipped_files.csv', clean_clipped_files + noise_clipped_files)
    utils.write_log_file(log_dir, 'low_activity_files.csv', \
                         clean_low_activity_files + noise_low_activity_files)

    # Compute and print stats about percentage of clipped and low activity files
    total_clean = len(clean_source_files) + len(clean_clipped_files) + len(clean_low_activity_files)
    total_noise = len(noise_source_files) + len(noise_clipped_files) + len(noise_low_activity_files)
    pct_clean_clipped = round(len(clean_clipped_files)/total_clean*100, 1)
    pct_noise_clipped = round(len(noise_clipped_files)/total_noise*100, 1)
    pct_clean_low_activity = round(len(clean_low_activity_files)/total_clean*100, 1)
    pct_noise_low_activity = round(len(noise_low_activity_files)/total_noise*100, 1)

    print("Of the " + str(total_clean) + " clean speech files analyzed, " + \
          str(pct_clean_clipped) + "% had clipping, and " + str(pct_clean_low_activity) + \
          "% had low activity " + "(below " + str(params['clean_activity_threshold']*100) + \
          "% active percentage)")
    print("Of the " + str(total_noise) + " noise files analyzed, " + str(pct_noise_clipped) + \
          "% had clipping, and " + str(pct_noise_low_activity) + "% had low activity " + \
          "(below " + str(params['noise_activity_threshold']*100) + "% active percentage)")
Example #22
0
                self.test_s5.append(value)
            elif k == 10:
                self.test_s10.append(value)
            else:
                print('cannot find!')
        write_log_file(
            self.log_path,
            "S@1, S@5, S@10\n{}, {}, {}".format(self.test_s1[-1],
                                                self.test_s5[-1],
                                                self.test_s10[-1]))


if __name__ == '__main__':
    all_time_1 = datetime.now()
    trainer = Trainer(arguments)
    if arguments.only_test:
        trainer.load_model(arguments.model_path)
    else:
        trainer.fit()
        trainer.load_model(trainer.best_model_path)

    # Reset the timer so the final message reports only the time spent testing
    all_time_1 = datetime.now()
    write_log_file(
        trainer.log_path,
        "Finished loading the model; starting test at {}".format(all_time_1))
    trainer.test(iter_no=trainer.max_iteration + 1)
    write_log_file(
        trainer.log_path,
        "\nAll Finished using ({})\n".format(datetime.now() - all_time_1))