Python AlphabetInfo 예제들, taiyaki.alphabet.AlphabetInfo Python 예제들

예제 #1

0

파일 보기

파일: merge_mappedsignalfiles.py 프로젝트: zovoilis-lab/taiyaki

def main():
    """Merge the reads of several mapped-signal files into one output file.

    The alphabet information of the first input file is used for the output
    file. Every input file (including the first) is version-checked and its
    alphabet information must equal that of the first file, otherwise an
    Exception is raised. A read id that has already been written is reported
    and not copied again.
    """
    args = parser.parse_args()
    reference_fn = args.input[0]
    with MAPPED_SIGNAL_READER(reference_fn) as reader:
        # The first file supplies the alphabet and modification information
        alph_info = alphabet.AlphabetInfo(*reader.get_alphabet_information())

    seen_ids = set()
    print("Writing reads to ", args.output)
    with MAPPED_SIGNAL_WRITER(args.output, alph_info) as writer:
        for in_fn in args.input:
            n_copied = 0
            with MAPPED_SIGNAL_READER(in_fn) as reader:
                check_version(reader, in_fn)
                this_alph_info = alphabet.AlphabetInfo(
                    *reader.get_alphabet_information())
                if not alph_info.equals(this_alph_info):
                    message = "Alphabet info in {} differs from that in {}"
                    raise Exception(message.format(in_fn, reference_fn))
                for read_id in reader.get_read_ids():
                    if read_id in seen_ids:
                        print("* Read", read_id,
                              "already present: not copying from", in_fn)
                        continue
                    read = reader.get_read(read_id)
                    read['read_id'] = read_id
                    writer.write_read(read)
                    seen_ids.add(read_id)
                    n_copied += 1
            print("Copied", n_copied, "reads from", in_fn)
    print("Copied", len(seen_ids), "reads in total")

예제 #2

0

파일 보기

def _load_data(args, log):
    """Load mapped reads and the alphabet definition used for training.

    Args:
        args: parsed command-line arguments; this function reads
            `input_strand_list`, `limit` and `input`.
        log: object with a `write` method, used for progress messages.

    Returns:
        Tuple `(read_data, alphabet_info)` where `read_data` is whatever
        `get_multiple_reads` returned for the selected read ids and
        `alphabet_info` is an `alphabet.AlphabetInfo` built (without
        reordering) from the alphabet information stored in the input file.
    """
    if args.input_strand_list is not None:
        read_ids = list(set(helpers.get_read_ids(args.input_strand_list)))
        # The two literals are parenthesised so .format() applies to the
        # whole message; otherwise it binds only to the second literal and
        # the '{}' placeholder is written out unformatted.
        log.write(('* Will train from a subset of {} strands, determined ' +
                   'by read_ids in input strand list\n').format(
                       len(read_ids)))
    else:
        log.write('* Will train from all strands\n')
        read_ids = 'all'

    if args.limit is not None:
        log.write('* Limiting number of strands to {}\n'.format(args.limit))

    with mapped_signal_files.HDF5Reader(args.input) as per_read_file:
        (bases_alphabet, collapse_alphabet,
         mod_long_names) = per_read_file.get_alphabet_information()
        read_data = per_read_file.get_multiple_reads(read_ids,
                                                     max_reads=args.limit)
        # read_data now contains a list of reads
        # (each an instance of the Read class defined in
        # mapped_signal_files.py, based on dict)

    log.write('* Loaded {} reads.\n'.format(len(read_data)))

    alphabet_info = alphabet.AlphabetInfo(bases_alphabet,
                                          collapse_alphabet,
                                          mod_long_names,
                                          do_reorder=False)
    log.write('* Using alphabet definition: {}\n'.format(str(alphabet_info)))

    return read_data, alphabet_info

예제 #3

0

파일 보기

파일: test_mapped_signal_files.py 프로젝트: zovoilis-lab/taiyaki

    def test_check_HDF5_mapped_read_file(self):
        """Verify that a non-conforming read is flagged by the checks.

        A read whose 'Reference' entry has been replaced by a string is
        checked, written to file and read back; both the read-level and the
        file-level check must return something other than "pass".
        """
        print("Creating flawed Read object from test data")
        flawed_dict = construct_mapped_read()
        # Deliberately the wrong type for the reference
        flawed_dict['Reference'] = "I'm not a numpy array!"
        flawed_read = mapped_signal_files.Read(flawed_dict)
        print("Checking contents")
        read_report = flawed_read.check()
        print("Check result on read object: should fail")
        print(read_report)
        self.assertNotEqual(read_report, "pass")

        print("Writing to file")
        alph = alphabet.AlphabetInfo(DEFAULT_ALPHABET, DEFAULT_ALPHABET)
        with mapped_signal_files.HDF5Writer(self.testfilepath,
                                            alph) as writer:
            writer.write_read(flawed_read)

        print("Current dir = ", os.getcwd())
        print("File written to ", self.testfilepath)

        print("\nOpening file for reading")
        with mapped_signal_files.HDF5Reader(self.testfilepath) as reader:
            read_ids = reader.get_read_ids()
            first_id = read_ids[0]
            print("Read ids=", first_id)
            print("Version number = ", reader.version)
            self.assertEqual(first_id, flawed_dict['read_id'])

            file_report = reader.check()
            print("Test report (should fail):", file_report)
            self.assertNotEqual(file_report, "pass")

예제 #4

0

파일 보기

def get_alphabet_info(model_info):
    """Build an AlphabetInfo (with do_reorder=True) from model information.

    The collapsed ("flat") alphabet is derived by walking
    `model_info.output_alphabet`: each position is mapped to the most recent
    base that is found in `model_info.can_alphabet` (the first base is always
    used as the starting value). The modification long names are taken from
    the second element of each `model_info.mod_long_names` entry.
    """
    output_alphabet = model_info.output_alphabet
    current_can = output_alphabet[0]
    flat_alphabet = current_can
    for base in output_alphabet[1:]:
        if base in model_info.can_alphabet:
            current_can = base
        flat_alphabet += current_can

    if len(model_info.mod_long_names) == 0:
        mod_long_names = []
    else:
        # Transpose the entries and keep the second column
        mod_long_names = list(zip(*model_info.mod_long_names))[1]

    return alphabet.AlphabetInfo(
        model_info.output_alphabet, flat_alphabet, mod_long_names,
        do_reorder=True)

예제 #5

0

파일 보기

def main():
    """Main function to process mapping for each read using functions in prepare_mapping_funcs

    Parses the command line, refuses to run (exit status 1) when the output
    path already exists and `args.overwrite` is not set, builds the alphabet
    information from `args.alphabet` and `args.mod`, remaps every selected
    fast5 read in worker processes and hands the results to
    `prepare_mapping_funcs.generate_output_from_results`.
    """
    args = parser.parse_args()
    print("Running prepare_mapping using flip-flop remapping")

    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)

    # Create alphabet and check for consistency
    # Each element of args.mod is indexed as
    # (modified base, canonical base, modification name)
    modified_bases = [elt[0] for elt in args.mod]
    canonical_bases = [elt[1] for elt in args.mod]
    # NOTE(review): these validations use assert, so they are skipped when
    # Python runs with -O
    for b in modified_bases:
        assert len(
            b
        ) == 1, "Modified bases must be a single character, got {}".format(b)
        assert b not in args.alphabet, "Modified base must not be a canonical base, got {}".format(
            b)
    for b in canonical_bases:
        assert len(
            b
        ) == 1, "Canonical coding for modified bases must be a single character, got {}".format(
            b)
        assert b in args.alphabet, "Canonical coding for modified base must be a canonical base, got {}".format(
            b)
    # full_alphabet appends the modified bases to the canonical alphabet;
    # flat_alphabet appends, position for position, their canonical bases
    full_alphabet = args.alphabet + ''.join(modified_bases)
    flat_alphabet = args.alphabet + ''.join(canonical_bases)
    modification_names = [elt[2] for elt in args.mod]

    alphabet_info = alphabet.AlphabetInfo(full_alphabet,
                                          flat_alphabet,
                                          modification_names,
                                          do_reorder=True)

    print("Converting references to labels using {}".format(
        str(alphabet_info)))

    # Make an iterator that yields all the reads we're interested in.
    fast5_reads = fast5utils.iterate_fast5_reads(
        args.input_folder,
        limit=args.limit,
        strand_list=args.input_strand_list,
        recursive=args.recursive)

    # Set up arguments (kwargs) for the worker function for each read
    kwargs = {}
    kwargs[
        'per_read_params_dict'] = prepare_mapping_funcs.get_per_read_params_dict_from_tsv(
            args.input_per_read_params)
    kwargs['model'] = helpers.load_model(args.model)
    kwargs['alphabet_info'] = alphabet_info
    kwargs['max_read_length'] = args.max_read_length
    kwargs['localpen'] = args.localpen

    # remaps a single read using flip-flop network
    workerFunction = prepare_mapping_funcs.oneread_remap

    def iter_jobs():
        # Yields one (filename, read_id, reference) job per read; the
        # reference is None when the read id is absent from the references.
        references = bio.fasta_file_to_dict(args.references,
                                            alphabet=full_alphabet)
        for fn, read_id in fast5_reads:
            yield fn, read_id, references.get(read_id, None)

    # With a read limit, size chunks from the limit and job count, clipped
    # to the range [1, 50]; otherwise use chunks of 50
    if args.limit is not None:
        chunksize = args.limit // (2 * args.jobs)
        chunksize = int(np.clip(chunksize, 1, 50))
    else:
        chunksize = 50

    results = imap_mp(workerFunction,
                      iter_jobs(),
                      threads=args.jobs,
                      fix_kwargs=kwargs,
                      unordered=True,
                      chunksize=chunksize)

    # results is an iterable of dicts
    # each dict is a set of return values from a single read
    prepare_mapping_funcs.generate_output_from_results(results, args.output,
                                                       alphabet_info)

예제 #6

0

파일 보기

    parser.add_argument('--max',type=int,default=None)
    parser.add_argument('--calibration', action="store_true" )



    args = parser.parse_args()
    """
    with open(args.training_file,"r") as f:
        training_file = json.loads("".join(f.readlines()))
    """

    dataset = pd.read_csv(args.dataset,sep=";")
    #print("data",dataset)
    #print(dataset.columns)
    print(args.alphabet,args.collapsed_alphabet)
    alpha = alphabet.AlphabetInfo(args.alphabet, args.collapsed_alphabet,args.mod_long_names)

    with  MAPPED_SIGNAL_WRITER(args.output, alpha) as hout:

        for index, row in dataset.iterrows():

            print(row)

            new_alphabet = row["new_alphabet"]
            mod_long_names = row["mod_long_names"]
            canonical = row.get("canonical","T")

            threshold = row["threshold"]
            filter_section = row["filter_section"]
            lower_threshold = row.get("lower_threshold",False) #Under set to 0
            higher_threshold = row.get("higher_threshold",False)#higher set to higvalue

예제 #7

0

파일 보기

파일: json_to_checkpoint.py 프로젝트: udishadc/taiyaki

def parse_sublayer(sublayer):
    """Build a network layer from its dictionary description.

    Args:
        sublayer: dict describing one layer. `sublayer['type']` selects the
            layer class; the other keys read depend on the type (for example
            'insize', 'size', 'winlen', 'stride', 'activation',
            'sublayers', 'output_alphabet', 'can_nmods',
            'modified_base_long_names') plus 'params'.

    Returns:
        The value returned by `set_params` for the constructed layer.

    A description of what is being loaded is written to stderr. Unsupported
    layer types, unsupported reversed layer types and convolution layers
    whose activation is not 'tanh' are reported on stderr and terminate the
    process via `sys.exit(1)`.
    """
    # TODO apply additional attributes (e.g. has_bias, convolutional padding)
    if sublayer['type'] == 'convolution':
        if sublayer['activation'] != 'tanh':
            # Report the offending activation; the layer type is always
            # 'convolution' in this branch and would tell the user nothing.
            sys.stderr.write((
                'Incompatible convolutional layer activation function ' +
                '({}) encountered.\n').format(sublayer['activation']))
            sys.exit(1)
        sys.stderr.write((
            'Loading convolutional layer with attributes:\n\tin size: {}\n' +
            '\tout size: {}\n\twinlen: {}\n\tstride: {}\n').format(
                sublayer['insize'], sublayer['size'], sublayer['winlen'],
                sublayer['stride']))
        layer = Convolution(
            sublayer['insize'], sublayer['size'], sublayer['winlen'],
            stride=sublayer['stride'], fun=tanh)
    elif sublayer['type'] == 'LSTM':
        sys.stderr.write((
            'Loading LSTM layer with attributes:\n\tin size: {}\n' +
            '\tout size: {}\n').format(
                sublayer['insize'], sublayer['size']))
        layer = Lstm(sublayer['insize'], sublayer['size'])
    elif sublayer['type'] == 'GruMod':
        sys.stderr.write((
            'Loading GRU layer with attributes:\n\tin size: {}\n' +
            '\tout size: {}\n').format(
                sublayer['insize'], sublayer['size']))
        layer = GruMod(sublayer['insize'], sublayer['size'])
    elif sublayer['type'] == 'reverse':
        # From here on `sublayer` refers to the wrapped (inner) layer, so
        # the final set_params call uses the inner layer's params and type.
        sublayer = sublayer['sublayers']
        if sublayer['type'] == 'GruMod':
            sys.stderr.write((
                'Loading Reverse GRU layer with attributes:\n\tin size: {}\n' +
                '\tout size: {}\n').format(
                    sublayer['insize'], sublayer['size']))
            layer = Reverse(GruMod(sublayer['insize'], sublayer['size']))
        elif sublayer['type'] == 'LSTM':
            sys.stderr.write((
                'Loading Reverse LSTM layer with attributes:\n' +
                '\tin size: {}\n\tout size: {}\n').format(
                    sublayer['insize'], sublayer['size']))
            layer = Reverse(Lstm(sublayer['insize'], sublayer['size']))
        else:
            sys.stderr.write((
                'Invalid reversed-time layer type ({})\n').format(
                    sublayer['type']))
            sys.exit(1)
    elif sublayer['type'] == 'GlobalNormTwoState':
        nbase = nbase_flipflop(sublayer['size'])
        sys.stderr.write((
            'Loading flip-flop layer with attributes:\n\tin size: {}\n' +
            '\tnbases: {}\n').format(sublayer['insize'], nbase))
        layer = GlobalNormFlipFlop(sublayer['insize'], nbase)
    elif sublayer['type'] == 'GlobalNormTwoStateCatMod':
        output_alphabet = sublayer['output_alphabet']
        # Build the collapsed alphabet: each canonical base is repeated once
        # for itself plus once per modification listed in 'can_nmods'.
        curr_can_base = 0
        collapse_alphabet = ''
        for can_i_nmod in sublayer['can_nmods']:
            collapse_alphabet += output_alphabet[curr_can_base] * (
                can_i_nmod + 1)
            curr_can_base += can_i_nmod + 1
        alphabet_info = alphabet.AlphabetInfo(
            output_alphabet, collapse_alphabet,
            sublayer['modified_base_long_names'], do_reorder=False)
        sys.stderr.write((
            'Loading modified bases flip-flop layer with attributes:\n' +
            '\tin size: {}\n\tmod bases: {}\n').format(
                sublayer['insize'], alphabet_info.mod_long_names))
        layer = GlobalNormFlipFlopCatMod(sublayer['insize'], alphabet_info)
    else:
        sys.stderr.write('Encountered invalid layer type ({}).\n'.format(
            sublayer['type']))
        sys.exit(1)

    layer = set_params(layer, sublayer['params'], sublayer['type'])

    return layer

예제 #8

0

파일 보기

 def get_alphabet_information(self):
     """Return an AlphabetInfo built from this file's HDF5 attributes.

     The 'mod_long_names' attribute is split into lines to give the list
     of modification long names.
     """
     attrs = self.hdf5.attrs
     long_names = attrs['mod_long_names'].splitlines()
     return alphabet.AlphabetInfo(
         attrs['alphabet'], attrs['collapse_alphabet'], long_names)

예제 #9

0

파일 보기

def get_alphabet_info(output_alphabet, collapse_alphabet, mod_long_names):
    """Construct an AlphabetInfo from the given values with do_reorder=True."""
    positional = (output_alphabet, collapse_alphabet, mod_long_names)
    return alphabet.AlphabetInfo(*positional, do_reorder=True)

예제 #10

0

파일 보기

파일: test_mapped_signal_files.py 프로젝트: udishadc/taiyaki

    def test_HDF5_mapped_read_file(self):
        """Test that we can save a mapped read file and open it again

        Also produces a plot for diagnostic purposes (the plotting section
        is currently disabled by an `if False:` guard).
        """

        print("Creating Read object from test data")
        read_dict = construct_mapped_read_dict()
        read_object = signal_mapping.SignalMapping(**read_dict)
        print("Checking contents")
        check_text = read_object.check()
        print("Check result on read object:")
        print(check_text)
        self.assertEqual(check_text, "pass")

        print("Writing to file")
        # The temporary file is only used to obtain a path in the work
        # directory; delete=False keeps it on disk after the handle closes
        with tempfile.NamedTemporaryFile(delete=False,
                                         dir=self.testset_work_dir) as fh:
            testfilepath = fh.name
        alphabet_info = alphabet.AlphabetInfo(DEFAULT_ALPHABET,
                                              DEFAULT_ALPHABET)
        with mapped_signal_files.MappedSignalWriter(testfilepath,
                                                    alphabet_info) as f:
            f.write_read(read_object.get_read_dictionary())

        print("Current dir = ", os.getcwd())
        print("File written to ", testfilepath)

        print("\nOpening file for reading")
        with mapped_signal_files.MappedSignalReader(testfilepath) as f:
            ids = f.get_read_ids()
            print("Read ids=", ids[0])
            print("Version number = ", f.version)
            self.assertEqual(ids[0], read_dict['read_id'])

            file_test_report = f.check()
            print("Test report:", file_test_report)
            self.assertEqual(file_test_report, "pass")

            # Materialise the reads while the file is still open
            read_list = list(f.reads())

        recovered_read = read_list[0]
        reflen = len(recovered_read.Reference)
        siglen = len(recovered_read.Dacs)

        # Get a chunk - note that chunkstart is relative to the start of
        # the mapped region, not relative to the start of the signal
        chunklen, chunkstart = 5, 3
        chunk = recovered_read.get_chunk_with_sample_length(
            chunklen, chunkstart)

        # Check that the extracted chunk is the right length
        self.assertEqual(chunk.sig_len, chunklen)

        # Check that the mapping data agrees with what we put in
        self.assertTrue(
            np.all(recovered_read.Ref_to_signal == read_dict['Ref_to_signal']))

        # Plot a picture showing ref_to_sig from the read object,
        # and the result of searches to find the inverse
        # (disabled: change the condition to True to produce the plot)
        if False:
            plt.figure()
            plt.xlabel('Signal coord')
            plt.ylabel('Ref coord')
            # First and last elements of the chunk
            ix = np.array([0, -1])
            plt.scatter(chunk.current[ix],
                        chunk.sequence[ix],
                        s=50,
                        label='chunk limits',
                        marker='s',
                        color='black')
            plt.scatter(recovered_read.Ref_to_signal,
                        np.arange(reflen + 1),
                        label='reftosig (source data)',
                        color='none',
                        edgecolor='blue',
                        s=60)
            siglocs = np.arange(siglen, dtype=np.int32)
            sigtoref_fromsearch = recovered_read.get_reference_locations(
                siglocs)
            plt.scatter(siglocs,
                        sigtoref_fromsearch,
                        label='from search',
                        color='red',
                        marker='x',
                        s=50)
            plt.legend()
            plt.grid()
            plt.savefig(self.plotfilepath)
            print("Saved plot to", self.plotfilepath)

예제 #11

0

파일 보기

파일: test_mapped_signal_files.py 프로젝트: udishadc/taiyaki

    def test_check_HDF5_mapped_read_file(self):
        """Exercise the read checks and the writer with good and bad reads.

        A valid read must pass its own check, be accepted by the writer and
        produce a file whose check passes. A read whose reference is one
        element too short must fail its check, and writing it must raise
        TaiyakiSigMapError.
        """
        print("Creating Read object from test data")
        good_dict = construct_mapped_read_dict()
        good_read = signal_mapping.SignalMapping(**good_dict)
        print("Checking contents")
        report = good_read.check()
        print("Check result on valid read object: should pass")
        print(report)
        self.assertEqual(report, signal_mapping.SignalMapping.pass_str)

        print("Creating flawed Read object from test data")
        bad_dict = construct_mapped_read_dict()
        # Replace the reference by one that is a single element too short
        short_len = len(bad_dict['Reference']) - 1
        bad_dict['Reference'] = np.zeros(short_len, dtype=np.int32)
        bad_read = signal_mapping.SignalMapping(**bad_dict)
        print("Checking contents")
        report = bad_read.check()
        print("Check result on invalid read object: should fail")
        print(report)
        self.assertNotEqual(report, signal_mapping.SignalMapping.pass_str)

        print("Writing invalid read to file")
        alph = alphabet.AlphabetInfo(DEFAULT_ALPHABET, DEFAULT_ALPHABET)
        # Only the generated name is kept from the temporary file
        with tempfile.NamedTemporaryFile(
                delete=True, dir=self.testset_work_dir) as tmp:
            testfilepath = tmp.name
        with mapped_signal_files.MappedSignalWriter(testfilepath,
                                                    alph) as writer:
            try:
                writer.write_read(bad_read.get_read_dictionary())
            except signal_mapping.TaiyakiSigMapError:
                pass
            else:
                self.assertTrue(False, 'Invalid read passed checks.')

        print("Writing valid read to file")
        with tempfile.NamedTemporaryFile(
                delete=False, dir=self.testset_work_dir) as tmp:
            testfilepath = tmp.name
        with mapped_signal_files.MappedSignalWriter(testfilepath,
                                                    alph) as writer:
            try:
                writer.write_read(good_read.get_read_dictionary())
            except signal_mapping.TaiyakiSigMapError:
                self.assertTrue(False, 'Valid read failed checks.')

        print("Current dir = ", os.getcwd())
        print("File written to ", testfilepath)

        print("\nOpening valid file for reading")
        with mapped_signal_files.MappedSignalReader(testfilepath) as reader:
            read_ids = reader.get_read_ids()
            first_id = read_ids[0]
            print("Read ids=", first_id)
            print("Version number = ", reader.version)
            self.assertEqual(first_id, good_dict['read_id'])

            file_report = reader.check()
            print("Test report (should pass):", file_report)
            self.assertEqual(file_report,
                             signal_mapping.SignalMapping.pass_str)

예제 #12

0

파일 보기

파일: test_mapped_signal_files.py 프로젝트: zovoilis-lab/taiyaki

    def test_HDF5_mapped_read_file(self):
        """Test that we can save a mapped read file, open it again and
        use some methods to get data from it. Plot a picture for diagnostics
        (the plotting section is currently disabled by an `if False:` guard).
        """

        print("Creating Read object from test data")
        read_dict = construct_mapped_read()
        read_object = mapped_signal_files.Read(read_dict)
        print("Checking contents")
        check_text = read_object.check()
        print("Check result on read object:")
        print(check_text)
        self.assertEqual(check_text, "pass")

        print("Writing to file")
        alphabet_info = alphabet.AlphabetInfo(DEFAULT_ALPHABET, DEFAULT_ALPHABET)
        with mapped_signal_files.HDF5Writer(self.testfilepath, alphabet_info) as f:
            f.write_read(read_object)

        print("Current dir = ", os.getcwd())
        print("File written to ", self.testfilepath)

        print("\nOpening file for reading")
        with mapped_signal_files.HDF5Reader(self.testfilepath) as f:
            ids = f.get_read_ids()
            print("Read ids=", ids[0])
            print("Version number = ", f.version)
            self.assertEqual(ids[0], read_dict['read_id'])

            file_test_report = f.check()
            print("Test report:", file_test_report)
            self.assertEqual(file_test_report, "pass")

            # Fetch every read while the file is still open
            read_list = f.get_multiple_reads("all")

        recovered_read = read_list[0]
        reflen = len(recovered_read['Reference'])
        siglen = len(recovered_read['Dacs'])

        # Get a chunk - note that chunkstart is relative to the start of the mapped
        # region, not relative to the start of the signal
        chunklen, chunkstart = 5, 3
        chunkdict = recovered_read.get_chunk_with_sample_length(chunklen, chunkstart)

        # Check that the extracted chunk is the right length
        self.assertEqual(len(chunkdict['current']), chunklen)

        # Check that the mapping data agrees with what we put in
        self.assertTrue(np.all(recovered_read['Ref_to_signal']==read_dict['Ref_to_signal']))

        # Plot a picture showing ref_to_sig from the read object,
        # and the result of searches to find the inverse
        # (disabled: change the condition to True to produce the plot)
        if False:
            plt.figure()
            plt.xlabel('Signal coord')
            plt.ylabel('Ref coord')
            # First and last elements of the chunk
            ix = np.array([0, -1])
            plt.scatter(chunkdict['current'][ix], chunkdict['sequence'][ix],
                        s=50, label='chunk limits', marker='s', color='black')
            plt.scatter(recovered_read['Ref_to_signal'], np.arange(reflen + 1), label='reftosig (source data)',
                        color='none', edgecolor='blue', s=60)
            siglocs = np.arange(siglen, dtype=np.int32)
            sigtoref_fromsearch = recovered_read.get_reference_locations(siglocs)
            plt.scatter(siglocs, sigtoref_fromsearch, label='from search', color='red', marker='x', s=50)
            plt.legend()
            plt.grid()
            plt.savefig(self.plotfilepath)
            print("Saved plot to", self.plotfilepath)

예제 #13

0

파일 보기

    else:
        #  Read sequences from .fa / .fasta file
        seq_dict = {
            int(seq.id): convert_seq(str(seq.seq), args.alphabet)
            for seq in SeqIO.parse(args.reference, "fasta")
        }
        log.write('* Loaded references from {}.\n'.format(args.reference))
        #  Write pickle for future
        pickle_name = os.path.splitext(args.reference)[0] + '.pkl'
        with open(pickle_name, 'wb') as fh:
            pickle.dump(seq_dict, fh)
        log.write(('* Written pickle of processed references to {} for ' +
                   'future use.\n').format(pickle_name))

    log.write('* Reading network from {}\n'.format(args.model))
    alphabet_info = alphabet.AlphabetInfo(args.alphabet, args.alphabet)

    model_kwargs = {
        'size': args.size,
        'stride': args.stride,
        'winlen': args.winlen,
        # Number of input features to model e.g. was >1 for event-based models
        # (level, std, dwell)
        'insize': 1,
        'alphabet_info': alphabet_info
    }
    model_metadata = {'reverse': False, 'standardize': True}
    network = helpers.load_model(args.model,
                                 model_metadata=model_metadata,
                                 **model_kwargs).to(device)
    log.write('* Network has {} parameters.\n'.format(

예제 #14

0

파일 보기

파일: merge_mappedsignalfiles.py 프로젝트: udishadc/taiyaki

def validate_and_merge_alphabets(in_fns):
    """ Validate that all alphabets are compatible. Alphabets can be
    incompatible if:
      1) Same mod_base corresponds to different can_base
      2) Same mod_base has different mod_long_names
      3) Same mod_long_name has different mod_bases

    Also check file versions so that this doesn't short circuit a longer run.

    Args:
        in_fns: filenames of the mapped signal files to be merged. Must
            contain at least one filename.

    Returns:
        The merged `alphabet.AlphabetInfo` (built with do_reorder=True):
        the shared canonical bases followed by every distinct modified base
        found across the files. When none of the files defines a modified
        base the merged alphabet is just the canonical alphabet.

    Incompatible alphabets are reported on stderr and terminate the process
    via `sys.exit(1)`.
    """
    all_alphabets = []
    for in_fn in in_fns:
        with MappedSignalReader(in_fn) as msr:
            all_alphabets.append(msr.get_alphabet_information())
            check_version(msr, in_fn)

    can_bases = all_alphabets[0].can_bases
    if not all((file_alphabet.can_bases == can_bases
                for file_alphabet in all_alphabets)):
        sys.stderr.write("All canonical alphabets must be the same for " +
                         "--allow_mod_merge. Got: {}\n".format(', '.join(
                             set(fa.can_bases for fa in all_alphabets))))
        sys.exit(1)

    # all_mods: mod_base -> (can_base, mod_long_name)
    # mod_long_names: mod_long_name -> mod_base
    # mod_fns: mod_base -> file in which the mod_base was first seen
    all_mods, mod_long_names, mod_fns = {}, {}, {}
    for in_fn, file_alphabet in zip(in_fns, all_alphabets):
        for mod_base in file_alphabet.mod_bases:
            can_base = mod_base.translate(file_alphabet.translation_table)
            mod_long_name = file_alphabet.mod_name_conv[mod_base]
            if mod_base in all_mods:
                # if this mod base has been seen assert that all other
                # attributes agree
                if all_mods[mod_base] != (can_base, mod_long_name):
                    sys.stderr.write(
                        ('Incompatible modified bases encountered:\n\t' +
                         '{}={} (alt to {}) from {}\n\t' +
                         '{}={} (alt to {}) from {}\n').format(
                             mod_base, mod_long_name, can_base, in_fn,
                             mod_base, all_mods[mod_base][1],
                             all_mods[mod_base][0], mod_fns[mod_base]))
                    sys.exit(1)
            else:
                # if the mod_base has not been seen before, the long name must
                # also be unique
                if mod_long_name in mod_long_names:
                    sys.stderr.write(
                        ('Incompatible modified bases encountered:\n\t' +
                         '{}={} (alt to {}) from {}\n\t' +
                         '{}={} (alt to {}) from {}\n').format(
                             mod_base, mod_long_name, can_base, in_fn,
                             mod_long_names[mod_long_name], mod_long_name,
                             all_mods[mod_long_names[mod_long_name]][0],
                             mod_fns[mod_long_names[mod_long_name]]))
                    sys.exit(1)
                all_mods[mod_base] = (can_base, mod_long_name)
                mod_long_names[mod_long_name] = mod_base
                mod_fns[mod_base] = in_fn

    merged_mods = [(m_base, can_b, mln)
                   for m_base, (can_b, mln) in all_mods.items()]
    if merged_mods:
        # Transpose the (mod_base, can_base, long_name) triples into columns
        merged_mod_bases, merged_can_bases, merge_mod_long_names = zip(
            *merged_mods)
    else:
        # No modified bases in any input: zip(*[]) yields no columns, so
        # indexing its result would raise IndexError. Fall back to an
        # unmodified alphabet instead.
        merged_mod_bases, merged_can_bases, merge_mod_long_names = (), (), []
    merge_alphabet = can_bases + ''.join(merged_mod_bases)
    merge_collapse_alphabet = can_bases + ''.join(merged_can_bases)
    return alphabet.AlphabetInfo(merge_alphabet,
                                 merge_collapse_alphabet,
                                 merge_mod_long_names,
                                 do_reorder=True)