Example #1
def trim_ORF(core_dna, end_j_seq):
    """Trim 1-3 trailing nucleotides off core_dna so its reading frame ends exactly with the end_j_seq peptide; returns None (after logging an error) if no such trim works."""
    for i in range(1, 4):  # try trimming 1, 2 or 3 trailing nucleotides
        end = len(core_dna) - i
        start = end - 3 * len(end_j_seq)
        if end_j_seq == Bio.Seq.translate(core_dna[start:end]):
            return core_dna[:end]
    logger.error(
        f'\n{"#"*50}\nSomething went wrong with the ORF!!\n{core_dna}\n{Bio.Seq.transcribe(core_dna)}\n{end_j_seq}\n{"#"*50}'
    )
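A quick sanity check of trim_ORF (a sketch, not part of the original project): it assumes Biopython is installed and a logger is configured, and uses a made-up 21-nt ORF ending in the VTVSS end-J peptide with one stray nucleotide appended.

import logging
import Bio.Seq

logger = logging.getLogger(__name__)

core_dna = 'ATGGAAGTGACCGTCTCCTCA' + 'G'  # translates to M E V T V S S, plus one leftover trailing nucleotide
print(trim_ORF(core_dna, 'VTVSS'))       # prints the same string without the trailing 'G'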
Example #2
def get_meta_data(line_tokens, chain, isotype, core_dna, core_aa, cdr3,
                  best_v_family_col, best_d_family_col, best_j_family_col):

    if not re.match(chain + r'V\d+', line_tokens[best_v_family_col]):
        # log the offending line; the .group() call below will then raise an AttributeError
        logger.error(line_tokens)
        logger.error(line_tokens[best_v_family_col])
    v_type = re.match(chain + r'V\d+', line_tokens[best_v_family_col]).group()
    d_type = 'unknown'
    if line_tokens[best_d_family_col]:  # d assignment is sometimes missing
        d_type = re.match(chain + r'D\d+',
                          line_tokens[best_d_family_col]).group()
    j_type = re.match(chain + r'J\d+', line_tokens[best_j_family_col]).group()

    return [chain, isotype, core_dna, core_aa, cdr3, v_type, d_type, j_type]
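For context, a toy illustration (not project code) of what the re.match calls above extract: given a hypothetical best-hit string in the general shape MiXCR reports, the chain + r'V\d+' pattern keeps only the gene-family prefix.

import re

chain = 'IGH'
best_v_hit = 'IGHV3-23*00(1250)'                      # hypothetical best-V-hit value
print(re.match(chain + r'V\d+', best_v_hit).group())  # -> 'IGHV3'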
Example #3
def remove_colliding_entries_from_default_dict(alternative_taxon_dict, default_taxon_dict):
    alternative_gene_names = get_gene_names(alternative_taxon_dict)
    default_gene_names = set(get_gene_names(default_taxon_dict))
    for gene_name in alternative_gene_names:
        if gene_name in default_gene_names:
            gene_to_remove = -1
            for i, gene_dict in enumerate(default_taxon_dict['genes']):
                if gene_dict['name'] == gene_name:
                    gene_to_remove = i
                    break
            sequence_to_remove = -1
            for i, sequence_dict in enumerate(default_taxon_dict['sequenceFragments']):
                if sequence_dict['uri'].endswith(gene_name):
                    sequence_to_remove = i
                    break
            if gene_to_remove > -1:
                default_taxon_dict['genes'].pop(gene_to_remove)
                if sequence_to_remove > -1:
                    default_taxon_dict['sequenceFragments'].pop(sequence_to_remove)
                else:
                    # avoid pop(-1), which would silently drop an unrelated fragment
                    logger.error(f'Did not find sequence for {gene_name}')
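A toy run under stated assumptions: the dicts are assumed to follow the shape the function accesses (a 'genes' list of {'name': ...} and a 'sequenceFragments' list of {'uri': ...}), and get_gene_names is a hypothetical stand-in for the project's helper.

def get_gene_names(taxon_dict):  # hypothetical stand-in for the project's helper
    return [gene['name'] for gene in taxon_dict['genes']]

default_taxon_dict = {
    'genes': [{'name': 'IGHV1-2'}, {'name': 'IGHV3-23'}],
    'sequenceFragments': [{'uri': 'taxa/default/IGHV1-2'}, {'uri': 'taxa/default/IGHV3-23'}],
}
alternative_taxon_dict = {
    'genes': [{'name': 'IGHV3-23'}],
    'sequenceFragments': [{'uri': 'taxa/alt/IGHV3-23'}],
}
remove_colliding_entries_from_default_dict(alternative_taxon_dict, default_taxon_dict)
print(default_taxon_dict)  # only the non-colliding 'IGHV1-2' gene and fragment remain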
Example #4
def verify_fastq_files_format(error_path, fastq1, fastq2):

    rep_num = os.path.split(os.path.split(fastq1)[0])[-1][-1] #/bioseq/data/results/asap/154832296135203243128690777655/reads/run1/R1.fastq

    num_lines_fastq1 = sum(1 for line in open(fastq1) if line.rstrip() != '')
    logger.info(f'{num_lines_fastq1} lines in {fastq1}')

    if num_lines_fastq1 % 4 != 0:
        err_msg = f'Illegal fastq file format: number of lines in {os.path.split(fastq1)[-1]} of rep {rep_num} is not a multiple of 4. One or more records are faulty.'
        logger.error(err_msg)
        with open(error_path, 'w') as error_path_f:
            error_path_f.write(err_msg)
        raise ValueError(err_msg)

    num_lines_fastq2 = sum(1 for line in open(fastq2) if line.rstrip() != '')
    logger.info(f'{num_lines_fastq2} lines in {fastq2}')

    if num_lines_fastq2 % 4 != 0:
        err_msg = f'Illegal fastq file format: number of lines in {os.path.split(fastq2)[-1]} of rep {rep_num} is not a multiple of 4. One or more records are faulty.'
        logger.error(err_msg)
        with open(error_path, 'w') as error_path_f:
            error_path_f.write(err_msg)
        raise ValueError(err_msg)

    if num_lines_fastq1 != num_lines_fastq2:
        err_msg = f'Illegal fastq files format: {os.path.split(fastq1)[-1]} and {os.path.split(fastq2)[-1]} of rep {rep_num} contain different number of lines ({num_lines_fastq1} and {num_lines_fastq2}, respectively).'
        logger.error(err_msg)
        with open(error_path, 'w') as error_path_f:
            error_path_f.write(err_msg)
        raise ValueError(err_msg)
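Why the multiple-of-4 check: every FASTQ record is exactly four lines (header, sequence, '+', quality). A throwaway demonstration, assuming it runs in the same module so the os import and logger are in scope; the paths and file contents below are made up.

record = '@read1 1:N:0:1\nACGTACGT\n+\nIIIIIIII\n'   # one 4-line FASTQ record
for path in ('/tmp/R1.fastq', '/tmp/R2.fastq'):
    with open(path, 'w') as f:
        f.write(record * 3)                          # 3 records -> 12 lines, a multiple of 4
verify_fastq_files_format('/tmp/error.txt', '/tmp/R1.fastq', '/tmp/R2.fastq')  # passes silently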
Example #5
def get_mixcr_cmds(lib_path, fastq_path, outpath, MMU, remote_run, error_path):

    if not os.path.exists(outpath):
        os.makedirs(outpath)

    logger.debug(f'fastq path: {fastq_path}')
    logger.debug(f'os.path.join(fastq_path, "R1.fastq"): {os.path.join(fastq_path, "R1.fastq")}')

    fastq1 = fastq2 = ''
    for file_name in os.listdir(fastq_path):
        if 'fastq' in file_name:
            if 'R1' in file_name:
                fastq1 = os.path.join(fastq_path, file_name)
            elif 'R2' in file_name:
                fastq2 = os.path.join(fastq_path, file_name)
    logger.info(f'fastq files paths are:\n{fastq1}\n{fastq2}')

    if not os.path.exists(fastq1):
        logger.error('R1.fastq is missing...')
        raise OSError('R1.fastq does not exist...')

    if not os.path.exists(fastq2):
        logger.error('R2.fastq is missing...')
        raise OSError('R2.fastq does not exist...')

    verify_fastq_files_format(error_path, fastq1, fastq2)

    vdjca_path = os.path.join(outpath, 'alignments.vdjca')
    clones_clns_path = os.path.join(outpath, 'clones.clns')

    align_cmd = ('mixcr align'                      #align command
                 ' -f'                                                          #overwrite output file if already exists
                 f' -s {"mouse" if MMU else "human"}'                                                       #consider species (mouse/human)
                 ' -c IGH,IGL,IGK'                                              #immunological chain gene(s) to align
                 #f' --report {outpath}/align_report.txt'                   #create report file
                 f' --library {lib_path.split(".json")[0]}'  # mixcr requires lib name without json suffix!!
                 ' -a'                                                          #save reads' ids from fastq files
                 ' --threads 4'                                                          #number of threads
                 #' --verbose'
                 f' {fastq1} {fastq2}'                               #input files- 2 X fastq files
                 f' {vdjca_path}')
             
    assemble_cmd = ('mixcr assemble'                    #assemble command
                    ' -r ' + outpath + '/assemble_report.txt'                       #create report file
                    ' -f'                                                           #overwrite output file if already exists
                    f' -i {outpath}/index_file'                                   #keep mapping between initial reads and final clones
                    ' -OseparateByC=true'                                           #separate by isotypes
                    ' --threads 4'                                                          #number of threads
                    #' -OcloneFactoryParameters.vParameters.featureToAlign=VRegion' #align v region and not v transcript
                    #' -OassemblingFeatures=[CDR3]'                   #define sequence to create clones by
                    #' -OminimalClonalSequenceLength=6'                             #minimum number of nucleotides in clonal sequence
                    f' {vdjca_path}'                                             #input file - VDJCA from previous step
                    f' {clones_clns_path}')                                         #output file

    exportAlignments_cmd = ('mixcr exportAlignments'    #exportAlignments command   
                            ' -f'                                        #overwrite output file if already exists
                            f' --preset-file {CONSTS.ASAP_EXEC+"/" if remote_run else ""}aln_fields.txt'         #export fields specified in aln_fields file
                            f' -cloneIdWithMappingType {outpath}/index_file'     #indicate the mapping status of each read
                            f' {vdjca_path}'                                   #input file- VDJCA from previous step
                            f' {outpath}/alignments.txt')                           #output file

    exportClones_cmds = []
    # for chain in chains:
    #     exportClones_cmd = ('mixcr exportClones'                #exportClones command
    #                     ' -f'                                   #overwrite output file if already exists
    #                     f' --chains {chain}'
    #                     f' --preset-file {CONSTS.ASAP_EXEC+"/" if remote_run else ""}assemble_fields.txt'    #export fields specified in assemble_fields file
    #                     f' -readIds {outpath}/index_file'
    #                     ' -o'                                   #remove out-of-frame clones
    #                     ' -t'                                   #remove stop codon clones
    #                     f' {clones_clns_path}'                  #input file
    #                     f' {outpath}/{chain}_clones.txt')
    #     exportClones_cmds.append(exportClones_cmd)

    return align_cmd, assemble_cmd, exportAlignments_cmd, exportClones_cmds
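One way the returned command strings might be consumed (a sketch under assumptions: mixcr is on the PATH, a POSIX shell is available, and the paths below are placeholders, not the project's real layout):

import subprocess

lib_path = 'my_library.json'          # placeholder V/D/J reference library
fastq_path = 'reads/run1'             # placeholder directory containing the R1/R2 fastq files
outpath = 'mixcr_output/run1'
error_path = 'mixcr_output/run1/error.txt'

align_cmd, assemble_cmd, exportAlignments_cmd, exportClones_cmds = get_mixcr_cmds(
    lib_path, fastq_path, outpath, MMU=False, remote_run=False, error_path=error_path)
for cmd in [align_cmd, assemble_cmd, exportAlignments_cmd] + exportClones_cmds:
    subprocess.run(cmd, shell=True, check=True)      # abort on the first failing MiXCR step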
Example #6
def parse_alignment_file(mixcr_output_path, parsed_mixcr_output_path,
                         sequence_annotation_file_suffix,
                         mutations_file_suffix, len_threshold, qlty_threshold):
    '''Parse MiXCR's alignments file.

    input: alignments file, path for output files, length threshold, quality thresholds of the total sequence and of the CDR3 region
    output: none. Creates output files as specified in the "notes" file.
    '''

    # column indices of the relevant data from mixcr's output (for more details see 'alignments.txt' file)
    overlapped_reads = 0
    quality = 1
    accession_number = 2
    DNA_FR1 = 4
    DNA_FR4 = 10
    AA_FR1 = 11
    AA_CDR3 = 16
    AA_FR4 = 17
    best_v_family = 23
    best_d_family = 24
    best_j_family = 25
    best_v_alignment = 31

    # dictionary to convert ASCII code to quality values
    ascii_to_quality_dict = {
        '!': 0,
        '"': 1,
        '#': 2,
        '$': 3,
        '%': 4,
        '&': 5,
        "'": 6,
        '(': 7,
        ')': 8,
        '*': 9,
        '+': 10,
        ',': 11,
        '-': 12,
        '.': 13,
        '/': 14,
        '0': 15,
        '1': 16,
        '2': 17,
        '3': 18,
        '4': 19,
        '5': 20,
        '6': 21,
        '7': 22,
        '8': 23,
        '9': 24,
        ':': 25,
        ';': 26,
        '<': 27,
        '=': 28,
        '>': 29,
        '?': 30,
        '@': 31,
        'A': 32,
        'B': 33,
        'C': 34,
        'D': 35,
        'E': 36,
        'F': 37,
        'G': 38,
        'H': 39,
        'I': 40,
        'J': 41,
        'K': 42
    }

    t1 = time.time()

    allowed_chain_types = ['IGH', 'IGK', 'IGL',
                           'unknown']  #do not use += or append here!

    total_lines = 1  # Shifted by one because of the header. More convenient when looking in notepad++...
    sequences_frequency_counter = {}  # maps core_aa -> number of reads observed

    # don't use dict.fromkeys(allowed_chain_types, {}) here: every key would share the SAME dict object!
    chain_to_aa_read_to_meta_data_dict = {chain: {} for chain in allowed_chain_types}
    chain_to_core_dna_to_mutations_info_dict = {chain: {} for chain in allowed_chain_types}
    chain_to_core_aa_to_dna_reads_and_accession_numbers = {chain: {} for chain in allowed_chain_types}
    pseudo_count = 1

    chain_to_count_dict = dict.fromkeys(allowed_chain_types, 0)
    isotypes_count_dict = dict.fromkeys(
        ['A', 'A1', 'A2', 'D', 'E', 'G', 'M', 'unknown'], 0)
    errors_count_dict = dict.fromkeys([
        'no_overlap', 'too_short_length', 'too_low_quality', 'missing_cdr3',
        'nonsense_stop_codon', 'inappropriate_end_j_seq'
    ], 0)

    alignments_txt_path = os.path.join(mixcr_output_path, 'alignments.txt')
    #alignments_filtered_txt_path = os.path.join(parsed_mixcr_output_path, 'alignments_filtered.txt')

    logger.info('Start parsing {}'.format(alignments_txt_path))
    with open(alignments_txt_path) as f:

        logger.info('File was opened successfully.')
        # read (and skip) the header line; its fields correspond to the column indices defined above
        header = f.readline()
        logger.info('First line of file is:\n{}'.format(header))

        #alignments_filtered_txt = header

        #iterate over alignments file line by line
        for line in f:

            # avoid a bug that happens when a tab is used instead of space
            line = re.sub('\t([012]:N:[012]:[012])', r' \1', line)
            #logger.debug('Next line of file is:\n{}'.format(line))
            line_tokens = line.split('\t')
            #count total number of entries provided by mixcr alignment
            total_lines += 1
            logger.debug(total_lines)

            if total_lines % 100000 == 0:
                logger.info('total_lines: {}'.format(total_lines))
            # If the first token contains two sequences (separated by a comma) it means that
            # MiXCR was unable to find an overlap between the two paired-end reads.
            if ',' in line_tokens[overlapped_reads]:
                errors_count_dict['no_overlap'] += 1
                continue

            chain = line_tokens[best_v_family][:3]

            # sanity check
            # if line_tokens[20][:3] != line_tokens[best_v_family_col][:3]:
            #     logger.debug(line)
            #     logger.debug(line[20][:3])
            #     logger.debug(line[best_v_family_col][:3])
            #     logger.debug('line[20][:3] != line[best_v_family_col][:3]')

            dna_read = line_tokens[overlapped_reads]
            # a combination that should generate the relevant part of the antibody dna
            # (from the end of the 5' primer until the end of the end_j_seq)
            #core_dna = line_tokens[6] + line_tokens[4] + line_tokens[10] + line_tokens[8] + line_tokens[14] + line_tokens[12] + line_tokens[16]
            core_dna = ''.join(line_tokens[DNA_FR1:DNA_FR4 + 1])

            # discard too short core dna's
            read_len = len(core_dna)
            if read_len < len_threshold:
                errors_count_dict['too_short_length'] += 1
                continue

            # discard low quality reads
            sequencing_quality = line_tokens[quality]
            #calculate average quality of read
            average_quality = sum(
                [ascii_to_quality_dict[k]
                 for k in sequencing_quality]) / read_len
            if average_quality < qlty_threshold:
                errors_count_dict['too_low_quality'] += 1
                continue

            #verify CDR3 is present
            cdr3 = line_tokens[AA_CDR3]
            if cdr3 == '':  # or '*' in cdr3 :
                errors_count_dict['missing_cdr3'] += 1
                continue

            # this should be the translation of the core_dna
            #core_aa = line_tokens[7] + line_tokens[5] + line_tokens[11] + line_tokens[9] + line_tokens[15] + line_tokens[13] + line_tokens[17]
            core_aa = ''.join(line_tokens[AA_FR1:AA_FR4 + 1])

            if logger.level <= 10:  # debug mode
                #sanity checks
                if core_aa != Bio.Seq.translate(
                        core_dna[:len(core_dna) // 3 * 3]):
                    logger.debug(
                        'core_aa is NOT identical to the translated core_dna')
                    logger.debug('core_aa:\n{}'.format(core_aa))
                    logger.debug('translated core_dna:\n{}'.format(
                        Bio.Seq.translate(core_dna)))
                if (core_dna
                        not in dna_read) and (not core_aa.endswith('VTVS_')):
                    logger.debug('dna_read:\n{}'.format(dna_read))
                    logger.debug('core dna:\n{}'.format(core_dna))
                if not core_aa.endswith(aa.end_j_seq):
                    logger.debug('end_j_seq after fixation is: {}'.format(
                        core_aa[-len(aa.end_j_seq[0]):]))

            # verify that core_aa is not non-sense
            if '*' in core_aa:
                logger.debug(
                    'line {} in alignment.txt file: STOP codon in core_aa!!!\n{}'
                    .format(total_lines, core_aa))
                errors_count_dict['nonsense_stop_codon'] += 1
                continue

            # verify that there is a proper end_j_seq.
            # MUST be after making sure that '*' is NOT in core_aa (otherwise it makes problems with the regex).
            if chain == 'IGH':
                has_end_j_seq = False
                for end_j_seq in aa.end_j_seq:  # aa.end_j_seq is a tuple with at least one string
                    if match_with_up_to_k_mismatches(core_aa[-len(end_j_seq):],
                                                     end_j_seq):
                        has_end_j_seq = True
                        break
                    if match_with_up_to_k_mismatches(
                            core_aa[-len(end_j_seq) - 1:-1], end_j_seq):
                        # in case of 'VTVSS_', remove last (full/partial) "codon"
                        core_aa = core_aa[:-1]
                        end_j_seq = core_aa[-len(
                            end_j_seq
                        ):]  # update current_end_j_seq. Maybe it's with one mismatch
                        core_dna = trim_ORF(
                            core_dna, end_j_seq
                        )  # sometimes it's a full codon, sometimes partial
                        has_end_j_seq = True
                        break

                if not has_end_j_seq:
                    logger.debug(
                        'IGH with no end_j_seq in core_aa:\n{}'.format(
                            core_aa))
                    errors_count_dict['inappropriate_end_j_seq'] += 1
                    # if core_aa[-6:-1] in aa.end_j_seq:
                    #     errors_count_dict['VTVSS_'] = errors_count_dict.get('VTVSS_',0) + 1
                    continue

            #no more filtrations after this point!!
            #alignments_filtered_txt += line

            if chain not in allowed_chain_types:
                logger.error('chain_type {} not in {}'.format(
                    chain, allowed_chain_types))
                logger.error(line_tokens)
                chain = 'unknown'

            # update chain counts
            chain_to_count_dict[chain] += 1

            if chain == 'IGH':
                isotype = get_isotype(dna_read, core_dna, end_j_seq)
                # update isotype counts
                isotypes_count_dict[isotype] += 1
            else:
                #No need to count these
                isotype = 'NONE'

            # update aa_sequence counts
            sequences_frequency_counter[
                core_aa] = sequences_frequency_counter.get(core_aa, 0) + 1

            # set annotation for the (unique) aa_sequence (only for the first time)
            if core_aa not in chain_to_aa_read_to_meta_data_dict[chain]:
                chain_to_aa_read_to_meta_data_dict[chain][
                    core_aa] = get_meta_data(line_tokens, chain, isotype,
                                             core_dna, core_aa, cdr3,
                                             best_v_family, best_d_family,
                                             best_j_family)

            # update mutation counts and Ka_Ks for the (unique) dna_sequence (only for the first time)
            if core_dna not in chain_to_core_dna_to_mutations_info_dict[chain]:
                #extract mutations field from column number $best_v_alignment_col that looks like this:
                #1|292|312|21|313|SG5CI8ASG15CSA36CSG90ASA91GDC95I98GSC143TSC148ASC218TSC259A|1288.0
                mutations_field = line_tokens[best_v_alignment].split("|")[5]
                update_mutation_count(
                    core_dna, mutations_field,
                    chain_to_core_dna_to_mutations_info_dict[chain],
                    pseudo_count
                )  # chain_to_core_dna_to_num_of_non_synonymous_mutations[chain], pseudo_count)

            # track mapping between each aa sequence and the reads behind it
            chain_to_core_aa_to_dna_reads_and_accession_numbers[chain][
                core_aa] = chain_to_core_aa_to_dna_reads_and_accession_numbers[
                    chain].get(core_aa, []) + [
                        (core_dna, line_tokens[accession_number])
                    ]

    # for chain in chain_to_core_dna_to_num_of_mutations:
    #     core_dna_to_num_of_mutations = chain_to_core_dna_to_num_of_mutations[chain]
    #     if core_dna_to_num_of_mutations != {}:
    #         mutation_counts_file = parsed_mixcr_output_path + '/' + chain + mutations_file_suffix
    #         write_dict_to_file(mutation_counts_file, core_dna_to_num_of_mutations)

    for chain in allowed_chain_types:
        core_dna_to_mutations_info_dict = chain_to_core_dna_to_mutations_info_dict[
            chain]
        if core_dna_to_mutations_info_dict != {}:
            mutations_info_file = parsed_mixcr_output_path + '/' + chain + mutations_file_suffix
            write_dict_to_file(mutations_info_file,
                               core_dna_to_mutations_info_dict,
                               value_type=list,
                               header='dna' + '\t' + ';'.join([
                                   'Ka_per_codon', 'Ks_per_codon',
                                   'number_of_baspair_mutations'
                               ]))

    #for chain in chain_to_aa_read_to_meta_data_dict:
        aa_read_to_meta_data_dict = chain_to_aa_read_to_meta_data_dict[chain]
        if aa_read_to_meta_data_dict != {}:
            with open(
                    parsed_mixcr_output_path + '/' + chain +
                    sequence_annotation_file_suffix, 'w') as f:
                f.write('\t'.join([
                    'chain', 'isotype', 'dna', 'aa', 'missing_cdr3', 'v_type',
                    'd_type', 'j_type', 'counts'
                ]) + '\n')
                for core_aa in aa_read_to_meta_data_dict:
                    f.write('\t'.join(
                        aa_read_to_meta_data_dict[core_aa] +
                        [str(sequences_frequency_counter[core_aa])]) + '\n')

        core_aa_to_dna_reads_and_accession_numbers = chain_to_core_aa_to_dna_reads_and_accession_numbers[
            chain]
        if core_aa_to_dna_reads_and_accession_numbers != {}:
            aa_to_read_and_accession_path = os.path.join(
                parsed_mixcr_output_path, chain + '_AA_to_DNA_reads.fasta')
            write_mapping_file(core_aa_to_dna_reads_and_accession_numbers,
                               aa_to_read_and_accession_path)

    t2 = time.time()

    logger.debug('sum(isotypes_count_dict.values()): ' +
                 str(sum(isotypes_count_dict.values())))

    outfile_report = parsed_mixcr_output_path + '/alignment_report.log'
    write_reports(outfile_report, t1, t2, errors_count_dict, total_lines,
                  chain_to_count_dict, isotypes_count_dict)

    outfile_pie_chart = outfile_report.replace('log', 'png')
    if isotypes_count_dict:
        generate_alignment_report_pie_chart(outfile_pie_chart,
                                            isotypes_count_dict)
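Side note on the quality handling in the parser above: the hard-coded ascii_to_quality_dict is exactly the Phred+33 encoding used by modern FASTQ files ('!' is ASCII 33 and maps to quality 0), so an equivalent conversion is simply ord(char) - 33. A small sketch with a made-up quality string:

def phred33_quality(char):
    """Phred+33 quality value of a single FASTQ quality character."""
    return ord(char) - 33

assert phred33_quality('!') == 0 and phred33_quality('K') == 42
quality_string = 'IIIHH@'                                          # made-up per-base qualities
average_quality = sum(phred33_quality(c) for c in quality_string) / len(quality_string)
print(average_quality)                                             # ~38.2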