Exemplo n.º 1
0
    def run(starting_from_here=False):
        """Populate the proteomes directory from whichever input was supplied.

        Inputs are tried in this order: ``p.species_list``, ``p.ids_list``,
        then the ``p.proteomes`` / ``p.annotations`` directories.

        :param starting_from_here: not used by this function.
        :return: 4 when a species or ids list is given but
            ``test_entrez_conn()`` fails; a non-zero code from the fetch
            helper when fetching fails; otherwise the result of
            ``make_proteomes`` or ``adjust_proteomes``. Returns None when
            neither fasta nor genbank files were collected.
        """
        if p.species_list:
            if not test_entrez_conn():
                log.error(
                    '   No internet connection: cannot fetch annotations.')
                return 4

            log.debug('   Using species list: ' + str(p.species_list))
            gb_ids = read_list(p.species_list)
            log.debug('species_list: ' + str(gb_ids))
            res = fetch_annotations_species_name_entrez(
                config.annotations_dir, gb_ids, p.proxy)
            if res != 0:
                return res
            return make_proteomes(config.annotations_dir, config.proteomes_dir)

        if p.ids_list:
            if not test_entrez_conn():
                log.error('No internet connection: cannot fetch annotations.')
                return 4

            log.debug('   Using ref ids: ' + str(p.ids_list))
            ref_ids = read_list(p.ids_list)
            # Pass the proxy here as well, like the other call of this helper
            # below; otherwise the user's proxy setting is ignored on this path.
            res = fetch_annotations_for_ids(
                config.annotations_dir, ref_ids, p.proxy)
            if res != 0:
                return res
            return make_proteomes(config.annotations_dir, config.proteomes_dir)

        proteomes, annotations = [], []

        if p.proteomes:
            proteomes, annotations = collect_proteomes_and_annotaitons(
                p.proteomes)
            if not proteomes:
                interrupt('No fasta found in ' + p.proteomes)

        if p.annotations:
            # Overrides whatever was collected from p.proteomes above.
            proteomes, annotations = collect_proteomes_and_annotaitons(
                p.annotations)
            if not annotations:
                interrupt('No gb files found in ' + p.annotations)

        # Genbank annotations take precedence over plain fasta proteomes.
        if annotations:
            if not isdir(config.annotations_dir):
                mkdir(config.annotations_dir)

            for annotation in annotations:
                copy(annotation, config.annotations_dir)

            return make_proteomes(config.annotations_dir,
                                  config.proteomes_dir)

        if not proteomes:
            # NOTE(review): nothing was collected; the original code fell
            # through here and implicitly returned None - confirm the caller
            # handles that value.
            return None

        if not isdir(config.proteomes_dir):
            mkdir(config.proteomes_dir)

        if p.download_anno:
            if test_entrez_conn():
                # Proteome file names (without extension) are used as the ids
                # to fetch annotations for.
                gb_ids = [
                    splitext(basename(prot_file))[0]
                    for prot_file in proteomes
                ]
                log.debug('ids_list: ' + str(gb_ids))
                res = fetch_annotations_for_ids(
                    config.annotations_dir, gb_ids, p.proxy)
                if res > 0:
                    return res
                if res == -1:
                    # Non-fatal: carry on with annotation download disabled.
                    p.download_anno = False
            else:
                # Non-fatal, so report it at warning level and continue.
                log.warn(
                    '   Warning: no internet connection, cannot fetch annotations. '
                    'A reduced version of orthogroups.txt with no annotations will be produced.'
                )

        return adjust_proteomes(proteomes, config.proteomes_dir,
                                p.prot_id_field)
Exemplo n.º 2
0
        def run(start_from_here=False):
            """Filter the input assemblies, run prodigal on them and register
            the predicted proteomes.

            Every non-hidden entry of ``p.assemblies`` is treated as an
            assembly. Assemblies that fail ``filter_assembly`` are skipped;
            the remaining ones are passed to prodigal, the resulting protein
            files go through ``adjust_proteomes`` and are finally copied into
            a freshly recreated ``new_proteomes_dir``.

            :param start_from_here: not used by this function.
            :return: 0 on success; 1 when no assembly passes filtering;
                otherwise the first non-zero code returned by prodigal or
                ``adjust_proteomes``. Calls ``exit(1)`` when
                ``filter_dublicated_proteomes`` leaves no assemblies.
            """
            assemblies = [
                join(p.assemblies, f) for f in listdir(p.assemblies)
                if not f.startswith('.')
            ]

            if isdir(config.proteomes_dir):
                assemblies = filter_dublicated_proteomes(
                    config.proteomes_dir, assemblies)
                if not assemblies:
                    log.warn(all_considered_warning % config.proteomes_dir)
                    # NOTE(review): terminates the whole process instead of
                    # returning a code like the other paths - kept as is.
                    exit(1)

            if not isdir(assemblies_dir):
                mkdir(assemblies_dir)
                log.debug('   Created assemblies_dir ' + assemblies_dir)

            # (name, filtered .fna path) of the assemblies that passed the
            # filter. Failed ones are dropped so that prodigal is not run on
            # them.
            filtered = []
            for assembly in assemblies:
                asm_name = splitext(basename(assembly))[0]
                filtered_asm = join(assemblies_dir, asm_name + '.fna')
                if filter_assembly(assembly,
                                   filtered_asm,
                                   skip=(4, 7, 10, 23, 32, 38),
                                   skip_after=51) == 0:
                    filtered.append((asm_name, filtered_asm))
                else:
                    log.warn('   Skipping assembly ' + assembly +
                             ': filtering failed.')
            if not filtered:
                log.error('No correct assemblies.')
                return 1

            new_proteomes = []
            for asm_name, filtered_asm in filtered:
                prot = join(config.proteomes_dir, asm_name + '.fasta')
                res = cmdline('prodigal',
                              parameters=[
                                  '-i', filtered_asm, '-o',
                                  join(config.intermediate_dir, asm_name),
                                  '-a', prot
                              ])()
                if res != 0:
                    return res
                log.info('')
                new_proteomes.append(prot)

            res = adjust_proteomes(new_proteomes,
                                   config.proteomes_dir,
                                   prot_id_field=0)
            if res != 0:
                return res

            # Recreate new_proteomes_dir so it holds only this run's proteomes.
            if exists(new_proteomes_dir):
                rmtree(new_proteomes_dir)
            if not isdir(new_proteomes_dir):
                mkdir(new_proteomes_dir)
            for prot in new_proteomes:
                copy(prot, join(new_proteomes_dir, basename(prot)))

            return 0