def test_guess_format():
    'It tests that the format of the sequence files can be guessed'
    # every case: file content, optional file name, expected format
    cases = (('>fasta\nACTAG\n', None, 'fasta'),
             ('LOCUS AX0809\n', None, 'genbank'),
             ('@fastq\nACTAG\n', 'hola.sfastq', 'fastq'),
             ('@fastq\nACTAG\n+\nAt+AA', 'hola.fastq', 'fastq'))
    for content, fname, expected_format in cases:
        fhand = StringIO.StringIO(content)
        if fname is not None:
            # only some of the cases give a name to the file
            fhand.name = fname
        assert guess_seq_file_format(fhand) == expected_format
Пример #2
0
def scrape_info_from_fname(path):
    '''It scrapes the information encoded in the name of a sequence file.

    path can be a string with the file path or an object with a last_version
    attribute that holds the file path.

    It returns a dict with:
        - 'format': the format guessed by guess_seq_file_format,
        - 'fpath': the given path, as it was received,
        - one entry for every dot separated item of the file name (without
          the extension) formed by a two character key, an underscore and a
          value, like pl_illumina. The platform value (key pl) is lowercased.

    A RuntimeError is raised when the file name has no platform item or when
    the platform is not one of the ACCEPTED_PLATFORMS.
    '''
    if isinstance(path, basestring):
        fpath = path
    else:
        fpath = path.last_version

    # the with statement closes the file even if the format guessing fails
    with open(fpath) as fhand:
        file_format = guess_seq_file_format(fhand)

    file_info = {'format': file_format}
    basename = os.path.splitext(os.path.basename(fpath))[0]
    for item in basename.split('.'):
        # only the items like xx_value are taken into account
        if len(item) < 3 or item[2] != '_':
            continue
        key, value = item.split('_', 1)
        if key == 'pl':
            value = value.lower()
        file_info[key] = value
    file_info['fpath'] = path

    # a file name without platform is reported as a not accepted platform
    # instead of failing with a KeyError
    platform = file_info.get('pl')
    if platform not in ACCEPTED_PLATFORMS:
        msg = 'The platform of your file({0}) is not '.format(platform)
        msg += 'in the accepted ones {0}'.format(ACCEPTED_PLATFORMS)
        raise RuntimeError(msg)

    return file_info
def create_cdna_intron_annotator(genomic_db, genomic_seqs_fhand):
    'It builds a function that annotates the introns of a cdna using a genomic'
    genomic_seqs_fhand = get_fhand(genomic_seqs_fhand)
    seqs_fpath = genomic_seqs_fhand.name
    seqs_format = guess_seq_file_format(genomic_seqs_fhand)
    genomic_seqs_index = SeqIO.index(seqs_fpath, seqs_format)

    def annotate_intron(sequence):
        'It adds the inferred introns to the features of the sequence'
        if sequence is None:
            return
        try:
            intron_positions = infer_introns_for_cdna(
                                         sequence=sequence,
                                         genomic_db=genomic_db,
                                         genomic_seqs_index=genomic_seqs_index)
        except KeyError as error:
            # str of a KeyError is the repr of its key, so the quotes and the
            # unicode mark are removed
            msg = str(error).lstrip('u').strip("'")
            if 'not found' in msg:
                msg += ' in seq file %s, but present in blast db %s' % \
                                           (genomic_seqs_fhand.name, genomic_db)
            raise RuntimeError(msg)
        for position in intron_positions:
            # every intron is a feature that starts and ends in the same
            # position
            location = FeatureLocation(position, position)
            intron = SeqFeature(location=location, type='intron',
                                qualifiers={'genomic_db':genomic_db})
            sequence.features.append(intron)
        return sequence
    return annotate_intron
def make_backbone_blast_db(project_dir, blast_db_seq, dbtype):
    '''It formats a blast database inside the project when it is needed.

    The database is created in the BACKBONE_DIRECTORIES['blast_databases']
    directory of the project and it is named with the basename of
    blast_db_seq. If a file with that name is already there nothing is done.
    A fasta blast_db_seq is symlinked into the directory, any other format is
    converted to fasta with seqio.

    It returns the path to the sequence file of the database.
    It raises a RuntimeError if makeblastdb_plus fails with a RuntimeError, in
    that case the sequence file of the database is removed.
    '''
    logger = logging.getLogger(LOGGER_NAME)
    #the name should be the basename of the blast_db_seq
    db_dir = join(project_dir, BACKBONE_DIRECTORIES['blast_databases'])
    if not exists(db_dir):
        makedirs(db_dir)
    db_seq_fpath = join(db_dir, _get_basename(blast_db_seq))
    if exists(db_seq_fpath):
        return db_seq_fpath

    # all the files are opened with with to be sure that they are closed
    with open(blast_db_seq) as in_fhand:
        blast_db_seq_format = guess_seq_file_format(in_fhand)
    if blast_db_seq_format == 'fasta':
        rel_symlink(blast_db_seq, db_seq_fpath)
    else:
        # a fresh handle is used for the conversion, as the original did
        with open(blast_db_seq) as in_fhand:
            with open(db_seq_fpath, 'w') as out_fhand:
                seqio(in_seq_fhand=in_fhand, out_seq_fhand=out_fhand,
                      out_format='fasta')
    logger.info('Formatting the database %s' % db_seq_fpath)
    try:
        makeblastdb_plus(db_seq_fpath, dbtype=dbtype)
    except RuntimeError:
        msg = 'Error making blastdb. db:%s\n dbtype:%s\n' % \
                                           (db_seq_fpath, dbtype)
        # the sequence file is removed, otherwise the next call would take the
        # database as already done
        remove(db_seq_fpath)
        raise RuntimeError(msg)
    return db_seq_fpath
Пример #5
0
def seq_pipeline_runner(pipeline, configuration, in_fhands, file_format=None,
                        writers=None, processes=False):

    '''It runs the given sequence pipeline and writes the resulting sequences.

    pipeline: the pipeline to run, when it is a str it is looked for in
        PIPELINES.
    configuration: it is passed as it is to the sequence processing functions.
    in_fhands: a dict with the input files, the sequence file under 'in_seq'
        and, optionally, the quality file under 'in_qual'.
    file_format: the format of the input, when it is None it is guessed from
        the 'in_seq' file.
    writers: a dict with the writers, every processed sequence is written with
        all of them. A writer should have a write method and a num_features
        attribute, its close method is called if it has one. When it is None
        the sequences are processed, but nothing is written.
    processes: when it is false or 1 the sequences are processed with
        _process_sequences, otherwise with _parallel_process_sequences using
        a temporary file that is closed once the sequences are consumed.

    It returns a dict with the num_features of every writer, using the same
    keys found in writers.
    '''
    if isinstance(pipeline, str):
        pipeline = PIPELINES[pipeline]

    if file_format is None:
        file_format = guess_seq_file_format(in_fhands['in_seq'])

    # without writers the default None would fail when asking for its values
    if writers is None:
        writers = {}

    # Here we extract our input/output files
    in_fhand_seqs = in_fhands['in_seq']
    in_fhand_qual = in_fhands['in_qual'] if 'in_qual' in in_fhands else None

    # NOTE(review): True == 1 in python, so processes=True also goes to the
    # non parallel branch, confirm that this is the intended behaviour
    processes = None if processes == 1 else processes

    temp_out_fhand = None
    try:
        # Here the SeqRecord generator is created
        if processes:
            temp_out_fhand = NamedTemporaryFile()
            sequences = _parallel_process_sequences(in_fhand_seqs,
                                                    in_fhand_qual,
                                                    file_format, pipeline,
                                                    configuration,
                                                    processes,
                                                    temp_out_fhand.name)
        else:
            sequences = _process_sequences(in_fhand_seqs, in_fhand_qual,
                                           file_format, pipeline,
                                           configuration)

        # The SeqRecord generator is consumed
        for sequence in sequences:
            for writer in writers.values():
                writer.write(sequence)
    finally:
        # the temporary file is closed even if the processing fails
        if temp_out_fhand is not None:
            temp_out_fhand.close()

    # Some of the writers needs to close in order to finish its work
    feature_counter = {}
    for wtype, writer in writers.items():
        if hasattr(writer, 'close'):
            writer.close()
        feature_counter[wtype] = writer.num_features

    return feature_counter
Пример #6
0
 def __init__(self, reference_fhand, reads_fhand, output_fhand,
              keep_unmapped=True):
     '''It stores the given files and options and it indexes the reads.

     The header is written right after storing the reference and the output
     files. The reads are indexed with SeqIO.index using the name of
     reads_fhand and its guessed format.
     '''
     self._reference_fhand = reference_fhand
     self._output_fhand = output_fhand
     # the statement order is kept, _write_header runs before the rest of
     # the attributes are set
     self._write_header()
     self._keep_unmapped = keep_unmapped
     reads_format = guess_seq_file_format(reads_fhand)
     reads_fpath = reads_fhand.name
     self._read_index = SeqIO.index(reads_fpath, format=reads_format)
def backbone_blast_runner(query_fpath, project_dir, blast_program,
                          blast_db=None, blast_db_seq=None, dbtype='nucl',
                          threads=False):
    '''It returns the path to the blast result, running the blast if needed.

    The result is stored inside the BACKBONE_DIRECTORIES['blast_dir']
    directory of the project, in a directory named after the query and the
    database. If the result file is already there the blast is not run.

    The database can be given as an already formatted database (blast_db) or
    as a sequence file (blast_db_seq). In the latter case the database is
    created with make_backbone_blast_db using the given dbtype.
    A query that is not a fasta file is converted into a temporary fasta file.

    It raises a RuntimeError if neither blast_db nor blast_db_seq are given
    or if the blast fails with a RuntimeError, in that case the result file
    is removed.
    '''
    if blast_db is None and blast_db_seq is None:
        raise RuntimeError('It needs a blast database or seqfile')

    #create a logger
    logger = logging.getLogger(LOGGER_NAME)
    query_basename = _get_basename(query_fpath)
    blast_dir = join(project_dir, BACKBONE_DIRECTORIES['blast_dir'])

    if blast_db:
        db_basename = _get_basename(blast_db)
    else:
        db_basename = _get_basename(blast_db_seq)
    result_dir = join(blast_dir, query_basename, db_basename)
    if not exists(result_dir):
        makedirs(result_dir)
    result_fpath = join(result_dir,
                        '%s.%s.xml' % (BACKBONE_BASENAMES['blast_basename'],
                                       blast_program))
    if exists(result_fpath):
        logger.info('Using the stored blast result %s' % result_fpath)
        return result_fpath

    #the input file should be fasta
    # the query is closed as soon as its format is known
    with open(query_fpath) as query_fhand:
        query_format = guess_seq_file_format(query_fhand)

    fasta_query_fhand = None
    try:
        if query_format != 'fasta':
            fasta_query_fhand = _create_temp_fasta_file(query_fpath)
            query_fpath = fasta_query_fhand.name

        #we have to create a database in
        #BACKBONE_DIRECTORIES['blast_databases']
        if blast_db_seq:
            blast_db = make_backbone_blast_db(project_dir, blast_db_seq,
                                              dbtype)

        logger.info('Running the blast %s' % result_fpath)
        try:
            blast_runner_plus(query_fpath, blast_db, blast_program,
                              result_fpath, threads=threads)
        except RuntimeError as error:
            # a half written result would be taken as a finished one by the
            # next run
            if exists(result_fpath):
                remove(result_fpath)
            msg = '%s \n database: %s\n database type: %s' % (str(error),
                                                             blast_db, dbtype)
            raise RuntimeError(msg)
    finally:
        # the temporary query is closed also when something fails
        if fasta_query_fhand is not None:
            fasta_query_fhand.close()

    return result_fpath
def main():
    'It removes the requested annotations from the seqs of the input file'
    # the input, the output and the annotations to remove
    in_fhand, out_fhand, annots_to_remove = set_parameters()

    # the output is written in the same format guessed for the input
    seq_format = guess_seq_file_format(in_fhand)

    clean_seqs = remove_annotation(in_fhand, seq_format, annots_to_remove)

    write_seqs_in_file(clean_seqs, seq_fhand=out_fhand, format=seq_format)
Пример #9
0
    def run(self):
        '''It prepares the fasta and qual files used as input by the assembly.

        The read files are classified by the platform tag found in their file
        names. The tags are checked in this order: pl_454, pl_illumina and
        pl_sanger, and a file is assigned only to the first one found. The
        sanger files are split by its guessed format, the fastq ones go
        before the fasta ones. Files without any of these tags and sanger
        files that are neither fasta nor fastq are not used.

        For every platform with files a fasta and a fasta.qual file are
        written in the assembly_input directory, unless the fasta file is
        already there.

        All the read files opened are closed at the end, also the ones not
        used and also when an error happens.
        '''
        self._log({'analysis_started':True})
        files_illumina = []
        files_454 = []
        files_sanger_with_qual = []
        files_sanger_without_qual = []
        # every opened file is tracked here to be able to close all of them
        opened_fhands = []
        try:
            for path in self._get_input_fpaths()['reads']:
                fpath = path.last_version
                fhand = open(fpath)
                opened_fhands.append(fhand)
                fname = os.path.split(fpath)[-1].lower()
                if 'pl_454' in fname:
                    files_454.append(fhand)
                elif 'pl_illumina' in fname:
                    files_illumina.append(fhand)
                elif 'pl_sanger' in fname:
                    format_ = guess_seq_file_format(fhand)
                    if format_ == 'fasta':
                        files_sanger_without_qual.append(fhand)
                    elif format_ == 'fastq':
                        files_sanger_with_qual.append(fhand)

            #fastq are processed before
            files_sanger = files_sanger_with_qual + files_sanger_without_qual

            #all files should be fasta and fasta.qual
            output_dir = self._create_output_dirs()['assembly_input']
            project_name = self._get_project_name()
            for ext, files in (('_in.454', files_454),
                               ('_in.sanger', files_sanger),
                               ('_in.illumina', files_illumina),):
                base_name = os.path.join(output_dir, project_name + ext)
                fasta_fpath = base_name + '.fasta'
                qual_fpath = base_name + '.fasta.qual'
                if os.path.exists(fasta_fpath) or not files:
                    continue
                with open(fasta_fpath, 'w') as fasta_fhand:
                    with open(qual_fpath, 'w') as qual_fhand:
                        self._cat_to_fasta(files, fasta_fhand, qual_fhand)
        finally:
            # close all files
            for fhand in opened_fhands:
                fhand.close()
        self._log({'analysis_finished':True})
Пример #10
0
def scrape_info_from_fname(path):
    """It scrapes the information encoded in the name of a sequence file.

    path can be a string with the file path or an object with a last_version
    attribute that holds the file path.

    It returns a dict with:
        - "format": the format guessed by guess_seq_file_format,
        - "fpath": the given path, as it was received,
        - one entry for every dot separated item of the file name (without
          the extension) formed by a two character key, an underscore and a
          value, like pl_illumina.
    """
    if isinstance(path, basestring):
        fpath = path
    else:
        fpath = path.last_version

    # the with statement closes the file even if the format guessing fails
    with open(fpath) as fhand:
        file_format = guess_seq_file_format(fhand)

    file_info = {"format": file_format}
    basename = os.path.splitext(os.path.basename(fpath))[0]
    for item in basename.split("."):
        # only the items like xx_value are taken into account
        if len(item) < 3 or item[2] != "_":
            continue
        key, value = item.split("_", 1)
        file_info[key] = value
    file_info["fpath"] = path
    return file_info
Пример #11
0
def seqio(in_seq_fhand, out_seq_fhand, out_format, double_encoding=False,
          in_qual_fhand=None, out_qual_fhand=None, in_format=None):
    '''It writes the sequences of the input file in the output format.

    The input format is guessed when it is not given. SeqIO.convert does the
    conversion unless a quality file is involved or one of the formats is
    repr, json or pickle, in those cases seqs_in_file and write_seqs_in_file
    are used. The output files are flushed at the end.
    '''
    if not in_format:
        in_format = guess_seq_file_format(in_seq_fhand)

    own_formats = ('repr', 'json', 'pickle')
    with_qual = in_qual_fhand is not None or out_qual_fhand is not None
    use_own_io = (with_qual or in_format in own_formats or
                  out_format in own_formats)

    if not use_own_io:
        SeqIO.convert(in_seq_fhand, in_format, out_seq_fhand, out_format)
    else:
        records = seqs_in_file(seq_fhand=in_seq_fhand,
                               qual_fhand=in_qual_fhand,
                               format=in_format,
                               double_encoding=double_encoding)
        write_seqs_in_file(records, seq_fhand=out_seq_fhand,
                           qual_fhand=out_qual_fhand, format=out_format)

    out_seq_fhand.flush()
    if out_qual_fhand:
        out_qual_fhand.flush()
Пример #12
0
def create_unique_contiguous_region_filter(distance, genomic_db,
                                           genomic_seqs_fpath):
    '''It returns a filter that marks the snvs located in a region that gives
    more than one match or more than one match_part.

    distance: the number of residues taken at each side of the snv to build
        the region that is blasted against the genomic database.
    genomic_db: the blast database with the genomic sequences.
    genomic_seqs_fpath: the path to the file with the genomic sequences.

    It raises a ValueError if genomic_db or genomic_seqs_fpath are not given.
    '''
    parameters = {'database': genomic_db}
    blast_runner = create_runner(tool='blastn', parameters=parameters)
    blast_parser = get_alignment_parser('blast')
    match_filters = [{'kind'     : 'score_threshold',
                      'score_key': 'similarity',
                      'min_score': 90,
                     },
                     {'kind'            : 'min_length',
                      'min_num_residues': 20,
                      'length_in_query' : True
                     }
                    ]
    if not genomic_seqs_fpath:
        msg = 'No genomic sequence file defined for unique SNV filter'
        raise ValueError(msg)
    if not genomic_db:
        msg = 'No genomic blast database defined for unique SNV filter'
        raise ValueError(msg)
    # the handle is only needed to guess the format, SeqIO.index gets the
    # file name, so the file is closed before building the index
    with open(genomic_seqs_fpath) as genomic_seqs_fhand:
        genomic_seqs_name = genomic_seqs_fhand.name
        genomic_seqs_format = guess_seq_file_format(genomic_seqs_fhand)
    genomic_seqs_index = SeqIO.index(genomic_seqs_name, genomic_seqs_format)

    def unique_contiguous_region_filter(sequence):
        '''It filters out the snv in regions repeated in the genome or
        discontiguous.

        The result stored for every snv is True when the region around it
        gives more than one match or when introns are inferred for its only
        match, False otherwise. The snvs that already have a result for this
        filter and distance are skipped. It returns the given sequence.
        '''
        if sequence is None:
            return None

        for snv in sequence.get_features(kind='snv'):
            # Check if it is already done
            previous_result = _get_filter_result(snv, 'uniq_contiguous',
                                                 threshold=distance)
            if previous_result is not None:
                continue

            #we make a blast
            #with the sequence around the snv
            location = snv.location.start.position
            start = max(location - distance, 0)
            end = location + distance
            seq_fragment = sequence[start:end]
            blast_fhand = blast_runner(seq_fragment)['blastn']
            # the blast result is closed even if the analysis fails
            try:
                #now we parse the blast
                blast_result = blast_parser(blast_fhand)
                alignments = filter_alignments(blast_result,
                                               config=match_filters)
                #are there any similar sequences?
                try:
                    # the next builtin works in python 2.6+ and python 3
                    alignment = next(alignments)
                except StopIteration:
                    #if there is no similar sequence we assume that is unique
                    alignment = None

                if alignment is None:
                    result = False
                elif len(alignment['matches']) > 1:
                    #how many matches, it should be only one
                    result = True
                else:
                    #how many match parts have the first match?
                    #we could do it with the blast result, but blast is not
                    #very good aligning, so we realign with est2genome
                    blast_fhand.seek(0)
                    sim_seqs = similar_sequences_for_blast(blast_fhand)
                    sim_seq = sim_seqs[0] if sim_seqs else None

                    introns = infer_introns_for_cdna(sequence=seq_fragment,
                                          genomic_seqs_index=genomic_seqs_index,
                                          similar_sequence=sim_seq,
                                          genomic_db=genomic_db)
                    result = bool(introns)
            finally:
                blast_fhand.close()
            _add_filter_result(snv, 'uniq_contiguous', result, distance)
        return sequence

    return unique_contiguous_region_filter
Пример #13
0
 def test_staticmethod():
     'The format guessing should return None for an empty file, not fail'
     empty_fhand = StringIO.StringIO()
     guessed_format = guess_seq_file_format(empty_fhand)
     assert guessed_format is None