Exemplo n.º 1
0
def gbk2fas(gbk_file):
    """Convert a GenBank file to FASTA format.

    The record is read with load_genbank and written with write_fasta to a
    path derived from gbk_file: everything from the first '.gbk' onwards is
    replaced by '.fas'. If gbk_file contains no '.gbk', '.fas' is appended
    to the full name instead.

    Returns the path of the FASTA file.
    """
    record = load_genbank(gbk_file)
    # str.find returns -1 when '.gbk' is absent; slicing with that value
    # would silently drop the last character of the name
    ext_pos = gbk_file.find('.gbk')
    if ext_pos < 0:
        stem = gbk_file
    else:
        stem = gbk_file[:ext_pos]
    fas_file = stem + '.fas'
    write_fasta(fas_file, record)
    return fas_file
Exemplo n.º 2
0
def iter_align(coord_array, ref_rec, query_rec, aln_dir, segs_file):
    """Iterate through array of coordinates to make pairwise alignments."""
    # set up the root subdirectories
    seqs = aln_dir + "input_seqs/"
    alns = aln_dir + "output_alns/"
    ensure_dir([seqs, alns])
    aln_id = 0
    aln_len = 0
    # cycle through segments
    for segment_pair in coord_array:
        print ".",
        xa, xb, xc, xd = segment_pair
        # extract the corresponding sequence slices
        ref_seq = ref_rec[abs(xa):abs(xb)]
        query_seq = query_rec[abs(xc):abs(xd)]
        # reverse-complement sequences with negative sign
        if xa < 0:
            ref_seq = ref_seq.reverse_complement()
        if xc < 0:
            query_seq = query_seq.reverse_complement()
        # write sequences to file
        mscl_in = seqs + str(xa) + "_" + str(xb) + "_" + str(xc) + "_" + str(
            xd) + ".fas"
        write_fasta(mscl_in, [ref_seq, query_seq])
        # skip segments that are too small to align
        if abs(abs(xa) - abs(xb)) < 10:
            idp = 0
        else:
            # set up outfiles
            mscl_out = alns + str(xa) + "_" + str(xb) + "_" + str(
                xc) + "_" + str(xd) + ".aln"
            logfile = aln_dir + "muscle_log.txt"
            # perform alignment
            align_muscle(mscl_in, mscl_out, logfile)
            idntot = parse_clustal_idstars(mscl_out)
            idp = int((float(idntot) / len(query_seq)) * 100)
            aln_id += idntot
            aln_len += len(query_seq)
        # write details out to segments file
        line = "\t".join([str(xa), str(xb), str(xc), str(xd), str(idp) + "\n"])
        open(segs_file, 'a').write(line)
    overall_id = int((float(aln_id) / aln_len) * 100)
    print ""
    return overall_id
Exemplo n.º 3
0
def iter_align(coord_array, ref_rec, query_rec, aln_dir, segs_file):
    """Iterate through array of coordinates to make pairwise alignments."""
    # set up the root subdirectories
    seqs = aln_dir+"input_seqs/"
    alns = aln_dir+"output_alns/"
    ensure_dir([seqs, alns])
    aln_id = 0
    aln_len = 0
    # cycle through segments
    for segment_pair in coord_array:
        print ".",
        xa, xb, xc, xd = segment_pair
        # extract the corresponding sequence slices
        ref_seq = ref_rec[abs(xa):abs(xb)]
        query_seq = query_rec[abs(xc):abs(xd)]
        # reverse-complement sequences with negative sign
        if xa < 0 :
            ref_seq = ref_seq.reverse_complement()
        if xc < 0 :
            query_seq = query_seq.reverse_complement()
        # write sequences to file
        mscl_in = seqs+str(xa)+"_"+str(xb)+"_"+str(xc)+"_"+str(xd)+".fas"
        write_fasta(mscl_in, [ref_seq, query_seq])
        # skip segments that are too small to align
        if abs(abs(xa)-abs(xb)) < 10:
            idp = 0
        else:
            # set up outfiles
            mscl_out = alns+str(xa)+"_"+str(xb)+"_"+str(xc)+"_"+str(xd)+".aln"
            logfile = aln_dir+"muscle_log.txt"
            # perform alignment
            align_muscle(mscl_in, mscl_out, logfile)
            idntot = parse_clustal_idstars(mscl_out)
            idp = int((float(idntot)/len(query_seq))*100)
            aln_id += idntot
            aln_len += len(query_seq)
        # write details out to segments file
        line = "\t".join([str(xa), str(xb), str(xc), str(xd), str(idp)+"\n"])
        open(segs_file, 'a').write(line)
    overall_id = int((float(aln_id)/aln_len)*100)
    print ""
    return overall_id
Exemplo n.º 4
0
 def extract_segs_seqs(self, record, out_dir):
     """Write each segment to its own FASTA file and annotate the record.

     For every entry of self.segs, the slice of record between the first
     two values of seg['coords'] is taken, reverse-complemented when
     seg['strand'] is negative, given the id <self.name>_<seg name> and
     written to <out_dir><self.name>_<seg name>.fas. A 'ref_seg' feature
     covering the same coordinates, with the segment name as its 'id'
     qualifier, is appended to record.features.

     Returns the record that was passed in, modified in place.
     """
     for seg in self.segs:
         # unpack segment coords
         seg_start, seg_stop = seg['coords'][0], seg['coords'][1]
         # extract segment sequence
         segment = record[seg_start:seg_stop]
         if seg['strand'] < 0:
             segment = segment.reverse_complement()
         # the same label serves as record id and as file name stem
         seg_label = self.name + "_" + seg['name']
         segment.id = seg_label
         # write to individual file
         out_file = out_dir + seg_label + ".fas"
         write_fasta(out_file, segment)
         # record segment feature
         feat_loc = FeatureLocation(seg_start, seg_stop)
         feature = SeqFeature(location=feat_loc,
                              type='ref_seg',
                              qualifiers={'id': seg['name']})
         record.features.append(feature)
     return record
Exemplo n.º 5
0
 def extract_segs_seqs(self, record, out_dir):
     """Extract the segment sequences and mark them on the record.

     Each entry of self.segs supplies 'coords' (start and stop are its
     first two values), 'strand' and 'name'. The matching slice of record
     is reverse-complemented if the strand is negative, renamed to
     <self.name>_<seg name> and saved as <out_dir><self.name>_<seg name>.fas.
     The segment is also recorded on record as a 'ref_seg' feature whose
     'id' qualifier is the segment name.

     Returns record itself; its features list is extended in place.
     """
     for seg in self.segs:
         # unpack segment coords
         seg_start, seg_stop = seg['coords'][0], seg['coords'][1]
         # extract segment sequence
         segment = record[seg_start:seg_stop]
         if seg['strand'] < 0:
             segment = segment.reverse_complement()
         # one label is used for both the record id and the file name
         seg_label = self.name+"_"+seg['name']
         segment.id = seg_label
         # write to individual file
         out_file = out_dir+seg_label+".fas"
         write_fasta(out_file, segment)
         # record segment feature
         feat_loc = FeatureLocation(seg_start, seg_stop)
         feature = SeqFeature(location=feat_loc,
                              type='ref_seg',
                              qualifiers={'id': seg['name']})
         record.features.append(feature)
     return record
Exemplo n.º 6
0
def basic_batch_blast(genomes, run_ref, blast_mode, r_root_dir, run_dirs,
                      fixed_dirs, blast_prefs, run_id, timestamp):
    """Send batch jobs to Blast. Muxes to multiple reference DBs."""
    # load inputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    in_root = run_root+run_dirs['ref_seg_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Blast segs to genomes @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # do blast
    for seg in run_ref.segs:
        input_file = in_root+ref_n+"_"+seg['name']+".fas"
        # translate if required
        if blast_mode == 'tn':
            record = load_fasta(input_file)
            record.seq = record.seq.translate()
            input_file = in_root+ref_n+"_"+seg['name']+"_aa.fas" # substitute
            write_fasta(input_file, record)
        out_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg['name']+"/"
        ensure_dir([out_dir])
        print "\t", seg['name'],
        for genome in genomes:
            g_name = genome['name']
            db_path = fixed_dirs['blast_db_dir']+g_name
            outfile = out_dir+g_name+"_out.txt"
            print ".",
            if blast_mode == 'n':
                local_blastn_2file(input_file, db_path, outfile, blast_prefs)
            elif blast_mode == 'tx':
                local_tblastx_2file(input_file, db_path, outfile, blast_prefs)
            elif blast_mode == 'tn':
                local_tblastn_2file(input_file, db_path, outfile, blast_prefs)
        print ""
    run_ref.log("All OK")
    return "OK"
Exemplo n.º 7
0
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes,
                    run_id, timestamp, mtype, mode):
    """Build a scaffold of contigs based on the reference.

    This takes contigs that gave positive hits when blasted with reference
    segments. The contigs were aligned against the complete reference in a
    previous step for mapping purposes. Now the output of that step is re-used
    determine their position. A caveat is that if there are natural local
    rearrangements in the sequence relative to the reference, they may not be
    resolved appropriately. The problem is somewhat moderated by the fact that
    this function takes the best (usually the largest) hit region as "anchor"
    to position the contig within the scaffold. But if the rearranged region
    takes up a significant portion of the contig length, the anchoring will
    probably not be called correctly. Visual inspection of the finalized
    maps should help diagnose any such problems. The order can be fixed
    manually using the Mauve Contig Mover, which is part of Mauve 2.

    Note that not all hit contigs are "real" hits, so filtering should be
    applied before scaffolding to generate constructs.

    Model-based filtering produces a list of contigs that will be passed to
    the scaffolder. If filtering manually by looking at the maps,
    there are two options available: either select exclusively OR exclude a
    subset of contigs for the scaffolding process. This is done by listing
    their ID number in the genome dictionaries in the config file then
    resuming the pipeline from this step.

    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ctgs_root = run_root + run_dirs['run_gbk_ctgs_dir'] + ref_n + "/"
    mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/"
    scaffolds_dir = run_root + run_dirs['scaffolds_dir'] + ref_n + "/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        ctgs_dir = ctgs_root + g_name + "/"
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root + g_name + "/"
        ensure_dir([mauve_dir, scaffolds_dir])
        scaff_fas = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.fas"
        scaff_gbk = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.gbk"
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_dir)
        anchors_array = np.zeros(1,
                                 dtype=[('ctg', 'i4'), ('start', 'i4'),
                                        ('end', 'i4'), ('orient', 'i2')])
        # identify contigs we want to select
        subset = []
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.gbk$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)

                if mode == "exclude":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = "(" + ctg_num + ")"
                            print msg,
                            run_ref.log(msg)
                        else:
                            subset.append(ctg_num)
                    except KeyError:
                        msg = "WARNING: no ignored segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)

                elif mode == "select":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = ctg_num
                            print msg,
                            run_ref.log(msg)
                            subset.append(ctg_num)
                        else:
                            msg = "(" + ctg_num + ")"
                            print msg,
                            run_ref.log(msg)
                    except KeyError:
                        msg = "WARNING: no selected segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)

        # at this point we should have a subset of contigs selected
        for ctg_num in subset:
            logstring = "".join(["\t", ctg_num])
            run_ref.log(logstring)
            # set inputs
            mauve_file = mauve_dir + ctg_num + ".mauve"
            bb_file = mauve_file + ".backbone"
            try:
                # parse Mauve output
                coords = mauver_load2_k0(bb_file, prox_D, mtype)
                # determine which segment to use as anchor
                anchor_seg = get_anchor_loc(coords)
                anchors_array = np.insert(
                    anchors_array, 0,
                    (ctg_num, anchor_seg['start'], anchor_seg['end'],
                     anchor_seg['orient']))
            except IOError:
                msg = "\tERROR: Mauve alignment not found\n\t"
                print msg
                run_ref.log(msg)
            except Exception:
                msg = "\tERROR: Iteration failure\n\t"
                print msg
                run_ref.log(msg)

        # abort if there is no valid contig to proceed with
        try:
            assert len(anchors_array) > 1  # always 1 left from stub
        except AssertionError:
            msg = "\tWARNING: Contig list empty\n\t"
            print msg
            run_ref.log(msg)
        else:
            # order contigs by anchor location
            anchors_array = np.sort(anchors_array, order='start')
            # load contig records from the genbank files in the matches directory
            ctg_list = []
            for ctg_anchor in anchors_array:
                ctg_num = ctg_anchor['ctg']
                if ctg_num > 0:
                    contig_gbk = ctgs_dir + g_name + "_" + str(
                        ctg_num) + ".gbk"
                    record = load_genbank(contig_gbk)
                    if ctg_anchor['orient'] == -1:  # flip record
                        record = record.reverse_complement(id=True,
                                                           name=True,
                                                           annotations=True,
                                                           description=True)
                    ctg_list.append(record)
                else:  # workaround for having 0 value leftover from stub
                    pass  # having it might come in handy in later dev
            # output scaffold files
            write_fasta(scaff_fas, ctg_list)
            scaff_record = SeqRecord('', id='temp')
            scaff_bumper = SeqRecord(separator, id='join')
            for record in ctg_list:
                feat_start = len(scaff_record.seq)
                scaff_record += record
                feat_stop = len(scaff_record.seq)
                scaff_record += scaff_bumper
                feat_loc = FeatureLocation(feat_start, feat_stop)
                pattern = re.compile(r'.*_(\d*)$')
                match = pattern.match(record.id)
                try:
                    ctg_num = match.group(1)
                except Exception:
                    ctg_num = 'N'
                feature = SeqFeature(location=feat_loc,
                                     type='contig',
                                     qualifiers={'id': ctg_num})
                scaff_record.features.append(feature)
            scaff_record.description = g_name + " scaffold from " + ref_n
            try:
                scaff_record.id = g_name
                write_genbank(scaff_gbk, scaff_record[:-100])  # rm last bumper
            except ValueError:
                scaff_record.id = g_name[:10]
                write_genbank(scaff_gbk, scaff_record[:-100])  # rm last bumper
            print ""
Exemplo n.º 8
0
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds):
    """Unpack genome files.

    Here, unpacking means extracting data and producing specific files to
    standardize how the information is made available to downstream analysis.
    Depending on the input file format, different unpacking methods are
    invoked. In all cases, this ensures that for each genome, there is a
    multifasta file of the contigs all together as well as a separate Genbank
    file for each contig.

    Supported input file formats are the following:
    - mfas: Basic whole genome sequence in multifasta file of contigs. This
    can be used to process a finished genome in a single Fasta file as well.
    - cgbk: All contigs concatenated in a single GenBank file (Genoscope,
    French WGS). This can be used to process a finished genome in a single
    GanBank file as well.
    # TODO: provide support for other possible input formats

    Unpacking 'cgbk' genomes involves an initial step to detect occurrences
    of the sequence separator and collect the start and stop coordinates of
    each contig. Each pair of coordinates can then be used to extract the
    contig sequence and create a SeqRecord for that contig, which SeqIO
    normally does when it unpacks multifasta files.

    """
    # set up inputs
    infile = genome['file']  #TODO: make GUI input loader (upstream)
    inpath = fixed_dirs['ori_g_dir'] + infile
    g_name = genome['name']
    print " ", g_name, "...",
    # prep output destinations
    mfas_dir = fixed_dirs['mfas_contigs_dir']
    fas_dir = fixed_dirs['fas_contigs_dir'] + g_name + "/"
    ensure_dir([mfas_dir, fas_dir])
    mfas_file = mfas_dir + g_name + "_contigs.fas"
    records = []
    # select unpacking method
    if genome['input'] is 'fas':
        try:
            path.exists(inpath) is True
        except ValueError:
            raise Exception("Bad input file path")
        genome_recs = load_multifasta(inpath)
        # generate GenBank files
        counter = 0
        for rec in genome_recs:
            counter += 1
            ctg_num = str(counter)
            new_id = g_name + "_" + ctg_num  # workaround for long ids
            new_seq = rec.seq
            new_seq.alphabet = generic_dna
            new_rec = SeqRecord(seq=new_seq, id=new_id)
            records.append(new_rec)  # for multifasta output
            fas_file = fas_dir + new_id + ".fas"
            write_fasta(fas_file, new_rec)
    elif genome['input'] is 'gbk':
        # load in genome data
        genome_rec = load_genbank(inpath)
        g_string = genome_rec.seq
        # find split coordinates
        coord_pairs = multisplit_finder(g_string, separator)
        # split record
        counter = 0
        for (start, stop) in coord_pairs:
            counter += 1
            ctg_num = str(counter)
            new_record = genome_rec[start:stop]
            new_record.id = g_name + "_" + ctg_num
            records.append(new_record)  # for multifasta output
            fas_file = fas_dir + g_name + "_" + ctg_num + ".fas"
            write_fasta(fas_file, new_record)
    else:
        xmsg = "Input file format " + genome[
            'input'] + " unspecified/unsupported"
        raise Exception(xmsg)
    print counter, "contigs"
    # write master file
    write_fasta(mfas_file, records)
    # pass records to stats logger
    ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)
Exemplo n.º 9
0
def batch_contig_annot(dataset):
    """Extract and annotate contigs."""
    # identify dataset contig file
    contigs_file = dirs['assembly_dir']+dataset['f_nick']+'/'+'contigs.fa'
    # locate the COG database
    cog_db = dirs['blast_db_dir']+'Cog_LE/Cog'
    # make the training file
    training_file = dirs['annot_dir']+dataset['f_nick']+'/'+'contigs.trn'
    #train_prodigal(contigs_file, training_file)
    # set output dirs
    fas_out_dir = dirs['annot_dir']+dataset['f_nick']+'/fasta/'
    gbk_out_dir = dirs['annot_dir']+dataset['f_nick']+'/predict/'
    aa_out_dir = dirs['annot_dir']+dataset['f_nick']+'/aa/'
    blast_out_dir = dirs['annot_dir']+dataset['f_nick']+'/rpsblast/'
    solid_out_dir = dirs['annot_dir']+dataset['f_nick']+'/genbank/'
    maps_out_dir = dirs['annot_dir']+dataset['f_nick']+'/maps/'
    ensure_dir(fas_out_dir)
    ensure_dir(gbk_out_dir)
    ensure_dir(aa_out_dir)
    ensure_dir(blast_out_dir)
    ensure_dir(solid_out_dir)
    # set phage hit collector
    contig_hits = {}
    sp_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\
                                   +dataset['f_nick']+'_kw_hits.html'
    all_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\
                                    +dataset['f_nick']+'_all_hits.html'
    sp_hit_list_handle = open(sp_hit_list, 'w')
    all_hit_list_handle = open(all_hit_list, 'w')
    sp_hit_list_handle.write("<ul>")
    all_hit_list_handle.write("<ul>")
    # load all contigs
    contigs_list = load_multifasta(contigs_file)
    # cycle through contigs
    ctg_count = 0
    gene_count = 0
    for contig in contigs_list:
        ctg_count +=1
        # use regex to acquire relevant record ID info
        pattern = re.compile(r'NODE_(\d*)_length_(\d*)_cov_(\d*)')
        match = pattern.match(contig.id)
        nick = match.group(1)+'_'+match.group(2)+'_'+match.group(3)
        contig.id = nick
        fasta_out = fas_out_dir+nick+'.fas'
        # write record to file
        write_fasta(fasta_out, contig)
        # create contig entry in dict
        contig_hits[nick] = []
        # run the annotation
        annot_gbk = gbk_out_dir+nick+'.gbk'
        annot_aa = aa_out_dir+nick+'.fas'
        #run_prodigal(fasta_out, annot_gbk, annot_aa, training_file)
        # blast the amino acids against COG
        print '\tblasting', dataset['f_nick'], nick
        blast_out = blast_out_dir+nick+'.xml'
        if path.isfile(blast_out):
            print "\t\talready blasted"
        else:
            local_rpsblast_2file(annot_aa, cog_db, blast_out, blast_prefs)
        # collect best hits
        rec_cogs = collect_cogs(blast_out)
        map_file = maps_out_dir+nick+'.pdf'
        # consolidate annotated genbank file
        record = load_fasta(fasta_out)
        aa_defs = load_multifasta(annot_aa)
        features = []
        counter = 1
        ctg_flag_1 = 0
        ctg_flag_2 = 0
        for protein in aa_defs:
            gene_count +=1
            # get feature details from description line
            # necessary because the prodigal output is not parser-friendly
            pattern = re.compile(r'\d+_\d+_\d+_\d+_\d+\s+\S+\s+(\d+)\s+\S+\s+(\d+)\s+\S+\s+(\S*\d)')
            match = pattern.match(protein.description)
            start_pos = int(match.group(1))
            end_pos = int(match.group(2))
            strand_pos = int(match.group(3))
            feat_loc = FeatureLocation(start_pos, end_pos)
            annotation = rec_cogs['Query_'+str(counter)]
            if ctg_flag_1 is 0:
                all_hit_list_handle.write("</ul><br><a href='"
                                          +"../../../../"
                                          +map_file
                                          +"'>Contig "
                                          +nick+"</a><ul>")
                ctg_flag_1 = 1
            all_hit_list_handle.write("<li>"+str(counter)
                                            +'. '+annotation+"</li>")
            # detect phage content in annotation
            phi_pattern = re.compile(r".+(COG\d+).+"
                                      "(phage|capsid|muramidase|tail|"
                                      "replication|helicase|polymerase|"
                                      "integrase|recombinase"
                                      "suppressor|hydrolase|transposase).+",
                                     re.IGNORECASE)
            phi_match = phi_pattern.match(annotation)
            if phi_match:
                hit_flag = 'on'
                hit_dict = {'CDS': counter,
                            'annot': annotation,
                            'COGs': phi_match.group}
                contig_hits[nick].append(hit_dict)
                # write out to summary file
                if ctg_flag_2 is 0:
                    sp_hit_list_handle.write("</ul><br><a href='"
                                             +"../../../../"
                                             +map_file
                                             +"'>Contig "
                                             +nick+"</a><ul>")
                    ctg_flag_2 = 1
                sp_hit_list_handle.write("<li>"+str(counter)
                                          +'. '+annotation+"</li>")
            else:
                hit_flag = 'off'
            # consolidation feature annotations
            quals = {'note': protein.description,
                     'fct': annotation,
                     'flag': hit_flag}
            feature = SeqFeature(location=feat_loc,
                                 strand=strand_pos,
                                 id=protein.id,
                                 type='CDS',
                                 qualifiers=quals)
            features.append(feature)
            counter +=1
        record.features = features
        record.description = dataset['f_nick']+'_contig_'+nick
        record.name = nick
        record.dbxrefs = ['Project:np1']
        record.seq.alphabet = generic_dna
        gbk_out = solid_out_dir+nick+'.gbk'
        write_genbank(gbk_out, record)
        # generate graphical map
        ContigDraw(nick, gbk_out, map_file)
    sp_hit_list_handle.write("</ul>")
    all_hit_list_handle.write("</ul>")
    sp_hit_list_handle.close()
    all_hit_list_handle.close()
    print "\t", gene_count, "predicted genes in", ctg_count, "contigs"
Exemplo n.º 10
0
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs, run_id,
                timestamp, prot_db_name, project_id):
    """Re-annotate contig and extract reference segments using coordinates."""
    # set inputs and outputs
    run_root = r_root_dir + run_id + "/"
    ref_name = ref['name']
    in_file = fixed_dirs['ori_g_dir'] + ref['file']
    seg_out_root = run_root + run_dirs['ref_seg_dir'] + ref_name + "/"
    gen_fas_root = fixed_dirs['fas_contigs_dir'] + ref_name + "/"
    if ref_annot_flag:
        ref_gbk = run_root + run_dirs[
            'ref_gbk_dir'] + ref_name + "_re-annot.gbk"
    else:  ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        ref_gbk = in_file
    ref_fas = run_root + run_dirs['ref_fas_dir'] + ref_name + ".fas"
    genome_fas = gen_fas_root + ref_name + "_1.fas"
    report_root = run_root + run_dirs['reports'] + ref_name + "/"
    ref_log = report_root + run_id + "_" + ref_name + "_log.txt"
    ensure_dir([seg_out_root, report_root, gen_fas_root])
    print " ", ref_name, "...",
    # initialize run_ref object
    run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'],
                        ref['capture'], ref_fas, ref_gbk, seg_out_root,
                        ref_log)
    # initialize reference log
    cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"]
    open(ref_log, 'w').write(" ".join(cl_header))
    # open record and ensure we have a fasta in the right place
    if not path.exists(ref_fas):
        if run_ref.input == 'fas':
            copyfile(in_file, ref_fas)
        elif run_ref.input == 'gbk':
            record = load_genbank(in_file)
            record.id = ref_name
            write_fasta(ref_fas, record)
        else:
            msg = "ERROR: Input not recognized for " + ref_name
            run_ref.log(msg)
            raise Exception(msg)
    # make a BLAST DB
    make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs)
    copyfile(ref_fas, genome_fas)
    # re-annotate ref contig
    if ref_annot_flag:
        record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs,
                           project_id)
    else:  ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        record = load_genbank(in_file)
    # load or generate segment definitions
    if run_ref.seg_mode == 'chop':
        run_ref.get_segs_from_chop(len(record.seq), ref['chop_size'])
    elif run_ref.seg_mode == 'list':
        run_ref.get_segs_from_list(ref['segs'])
    elif run_ref.seg_mode == 'feats':
        run_ref.get_segs_from_feats(ref['feat_type'])
    # extract segment sequences
    rec_annot = run_ref.extract_segs_seqs(record, seg_out_root)
    # write re-annotated reference sequence to file
    write_genbank(ref_gbk, rec_annot)
    # report results
    logstring = " ".join([str(len(run_ref.segs)), "segments"])
    print logstring
    run_ref.log(logstring)
    return run_ref
Exemplo n.º 11
0
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds):
    """Unpack genome files.

    Here, unpacking means extracting data and producing specific files to
    standardize how the information is made available to downstream analysis.
    Depending on the input file format, different unpacking methods are
    invoked. In all cases, this ensures that for each genome, there is a
    multifasta file of the contigs all together as well as a separate Genbank
    file for each contig.

    Supported input file formats are the following:
    - mfas: Basic whole genome sequence in multifasta file of contigs. This
    can be used to process a finished genome in a single Fasta file as well.
    - cgbk: All contigs concatenated in a single GenBank file (Genoscope,
    French WGS). This can be used to process a finished genome in a single
    GanBank file as well.
    # TODO: provide support for other possible input formats

    Unpacking 'cgbk' genomes involves an initial step to detect occurrences
    of the sequence separator and collect the start and stop coordinates of
    each contig. Each pair of coordinates can then be used to extract the
    contig sequence and create a SeqRecord for that contig, which SeqIO
    normally does when it unpacks multifasta files.

    """
    # set up inputs
    infile = genome['file'] #TODO: make GUI input loader (upstream)
    inpath = fixed_dirs['ori_g_dir']+infile
    g_name = genome['name']
    print " ", g_name, "...",
    # prep output destinations
    mfas_dir = fixed_dirs['mfas_contigs_dir']
    fas_dir = fixed_dirs['fas_contigs_dir']+g_name+"/"
    ensure_dir([mfas_dir, fas_dir])
    mfas_file = mfas_dir+g_name+"_contigs.fas"
    records = []
    # select unpacking method
    if genome['input'] is 'fas':
        try: path.exists(inpath) is True
        except ValueError: raise Exception("Bad input file path")
        genome_recs = load_multifasta(inpath)
        # generate GenBank files
        counter = 0
        for rec in genome_recs:
            counter +=1
            ctg_num = str(counter)
            new_id = g_name+"_"+ctg_num  # workaround for long ids
            new_seq = rec.seq
            new_seq.alphabet = generic_dna
            new_rec = SeqRecord(seq=new_seq, id=new_id)
            records.append(new_rec)  # for multifasta output
            fas_file = fas_dir+new_id+".fas"
            write_fasta(fas_file, new_rec)
    elif genome['input'] is 'gbk':
        # load in genome data
        genome_rec = load_genbank(inpath)
        g_string = genome_rec.seq
        # find split coordinates
        coord_pairs = multisplit_finder(g_string, separator)
        # split record
        counter = 0
        for (start, stop) in coord_pairs:
            counter +=1
            ctg_num = str(counter)
            new_record = genome_rec[start:stop]
            new_record.id = g_name+"_"+ctg_num
            records.append(new_record)  # for multifasta output
            fas_file = fas_dir+g_name+"_"+ctg_num+".fas"
            write_fasta(fas_file, new_record)
    else:
        xmsg = "Input file format "+genome['input']+" unspecified/unsupported"
        raise Exception(xmsg)
    print counter, "contigs"
    # write master file
    write_fasta(mfas_file, records)
    # pass records to stats logger
    ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)
Exemplo n.º 12
0
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs,
                run_id, timestamp, prot_db_name, project_id):
    """Re-annotate contig and extract reference segments using coordinates."""
    # set inputs and outputs
    run_root = r_root_dir+run_id+"/"
    ref_name = ref['name']
    in_file = fixed_dirs['ori_g_dir']+ref['file']
    seg_out_root = run_root+run_dirs['ref_seg_dir']+ref_name+"/"
    gen_fas_root = fixed_dirs['fas_contigs_dir']+ref_name+"/"
    if ref_annot_flag:
        ref_gbk = run_root+run_dirs['ref_gbk_dir']+ref_name+"_re-annot.gbk"
    else: ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        ref_gbk = in_file
    ref_fas = run_root+run_dirs['ref_fas_dir']+ref_name+".fas"
    genome_fas = gen_fas_root+ref_name+"_1.fas"
    report_root = run_root+run_dirs['reports']+ref_name+"/"
    ref_log = report_root+run_id+"_"+ref_name+"_log.txt"
    ensure_dir([seg_out_root, report_root, gen_fas_root])
    print " ", ref_name, "...",
    # initialize run_ref object
    run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'],
                        ref['capture'], ref_fas, ref_gbk, seg_out_root,
                        ref_log)
    # initialize reference log
    cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"]
    open(ref_log, 'w').write(" ".join(cl_header))
    # open record and ensure we have a fasta in the right place
    if not path.exists(ref_fas):
        if run_ref.input == 'fas':
            copyfile(in_file, ref_fas)
        elif run_ref.input == 'gbk':
            record = load_genbank(in_file)
            record.id = ref_name
            write_fasta(ref_fas, record)
        else:
            msg = "ERROR: Input not recognized for "+ref_name
            run_ref.log(msg)
            raise Exception(msg)
    # make a BLAST DB
    make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs)
    copyfile(ref_fas, genome_fas)
    # re-annotate ref contig
    if ref_annot_flag:
        record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs,
                           project_id)
    else: ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        record = load_genbank(in_file)
    # load or generate segment definitions
    if run_ref.seg_mode == 'chop':
        run_ref.get_segs_from_chop(len(record.seq), ref['chop_size'])
    elif run_ref.seg_mode == 'list':
        run_ref.get_segs_from_list(ref['segs'])
    elif run_ref.seg_mode == 'feats':
        run_ref.get_segs_from_feats(ref['feat_type'])
    # extract segment sequences
    rec_annot = run_ref.extract_segs_seqs(record, seg_out_root)
    # write re-annotated reference sequence to file
    write_genbank(ref_gbk, rec_annot)
    # report results
    logstring = " ".join([str(len(run_ref.segs)), "segments"])
    print logstring
    run_ref.log(logstring)
    return run_ref
Exemplo n.º 13
0

if __name__ == "__main__":
    # Load the BLAST hits and both sequence collections up front.
    raw_hits = parse_blast_output(BLAST_OUTPUT)
    id_to_sequence = parse_fasta(PDB_SEQS_STRUCTURE)
    ref_to_sequence = parse_fasta(REFERENCE_PROTEOME)

    # Order, merge and reduce the hits to a single best hit per query.
    best_hits = get_best_hits(combine_all_hits(order_hits(raw_hits)))

    for query, top_hit in best_hits.items():
        ref_id = top_hit[1]
        # Two-sequence FASTA: the query and its best reference match.
        pair_to_align = {
            query: id_to_sequence[query],
            ref_id: ref_to_sequence[ref_id],
        }

        unaligned_path = f'{TEMP}{query}.fasta'
        aligned_path = f'{TEMP}{query}_aligned.fasta'
        write_fasta(pair_to_align, unaligned_path)
        run_muscle(unaligned_path, aligned_path)

        # Turn the pairwise alignment into a residue mapping and save it.
        residue_mapping = make_mapping(parse_fasta(aligned_path))
        write_residue_mapping(residue_mapping,
                              f'{MAPPING_DIR}{query}_{ref_id}.txt')

#       print(query, best_hit[1], best_hit[0].qcov, best_hit[0].scov, best_hit[0].worst_eval, best_hit[0].smallest_ident)
Exemplo n.º 14
0
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator,
                    genomes, run_id, timestamp, mtype, mode):
    """Build a scaffold of contigs based on the reference.

    This takes contigs that gave positive hits when blasted with reference
    segments. The contigs were aligned against the complete reference in a
    previous step for mapping purposes. Now the output of that step is re-used
    determine their position. A caveat is that if there are natural local
    rearrangements in the sequence relative to the reference, they may not be
    resolved appropriately. The problem is somewhat moderated by the fact that
    this function takes the best (usually the largest) hit region as "anchor"
    to position the contig within the scaffold. But if the rearranged region
    takes up a significant portion of the contig length, the anchoring will
    probably not be called correctly. Visual inspection of the finalized
    maps should help diagnose any such problems. The order can be fixed
    manually using the Mauve Contig Mover, which is part of Mauve 2.

    Note that not all hit contigs are "real" hits, so filtering should be
    applied before scaffolding to generate constructs.

    Model-based filtering produces a list of contigs that will be passed to
    the scaffolder. If filtering manually by looking at the maps,
    there are two options available: either select exclusively OR exclude a
    subset of contigs for the scaffolding process. This is done by listing
    their ID number in the genome dictionaries in the config file then
    resuming the pipeline from this step.

    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/"
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    scaffolds_dir = run_root+run_dirs['scaffolds_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        ctgs_dir = ctgs_root+g_name+"/"
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root+g_name+"/"
        ensure_dir([mauve_dir, scaffolds_dir])
        scaff_fas = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.fas"
        scaff_gbk = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.gbk"
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_dir)
        anchors_array = np.zeros(1, dtype=[('ctg', 'i4'),
                                           ('start', 'i4'),
                                           ('end', 'i4'),
                                           ('orient', 'i2')])
        # identify contigs we want to select
        subset = []
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.gbk$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)

                if mode == "exclude":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                        else:
                            subset.append(ctg_num)
                    except KeyError:
                        msg = "WARNING: no ignored segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
                        
                elif mode == "select":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = ctg_num
                            print msg,
                            run_ref.log(msg)
                            subset.append(ctg_num)
                        else:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                    except KeyError:
                        msg = "WARNING: no selected segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
                        
        # at this point we should have a subset of contigs selected
        for ctg_num in subset:
            logstring = "".join(["\t", ctg_num])
            run_ref.log(logstring)
            # set inputs
            mauve_file = mauve_dir+ctg_num+".mauve"
            bb_file = mauve_file+".backbone"
            try:
                # parse Mauve output
                coords = mauver_load2_k0(bb_file, prox_D, mtype)
                # determine which segment to use as anchor
                anchor_seg = get_anchor_loc(coords)
                anchors_array = np.insert(anchors_array, 0,
                                          (ctg_num,
                                           anchor_seg['start'],
                                           anchor_seg['end'],
                                           anchor_seg['orient']))
            except IOError:
                msg = "\tERROR: Mauve alignment not found\n\t"
                print msg
                run_ref.log(msg)
            except Exception:
                msg = "\tERROR: Iteration failure\n\t"
                print msg
                run_ref.log(msg)

        # abort if there is no valid contig to proceed with
        try:
            assert len(anchors_array) > 1 # always 1 left from stub
        except AssertionError:
            msg = "\tWARNING: Contig list empty\n\t"
            print msg
            run_ref.log(msg)
        else:
            # order contigs by anchor location
            anchors_array = np.sort(anchors_array, order='start')
            # load contig records from the genbank files in the matches directory
            ctg_list = []
            for ctg_anchor in anchors_array:
                ctg_num = ctg_anchor['ctg']
                if ctg_num > 0:
                    contig_gbk = ctgs_dir+g_name+"_"+str(ctg_num)+".gbk"
                    record = load_genbank(contig_gbk)
                    if ctg_anchor['orient'] == -1: # flip record
                        record = record.reverse_complement(id=True, name=True,
                            annotations=True, description=True)
                    ctg_list.append(record)
                else: # workaround for having 0 value leftover from stub
                    pass # having it might come in handy in later dev
            # output scaffold files
            write_fasta(scaff_fas, ctg_list)
            scaff_record = SeqRecord('', id='temp')
            scaff_bumper = SeqRecord(separator, id='join')
            for record in ctg_list:
                feat_start = len(scaff_record.seq)
                scaff_record += record
                feat_stop = len(scaff_record.seq)
                scaff_record += scaff_bumper
                feat_loc = FeatureLocation(feat_start, feat_stop)
                pattern = re.compile(r'.*_(\d*)$')
                match = pattern.match(record.id)
                try: ctg_num = match.group(1)
                except Exception: ctg_num = 'N'
                feature = SeqFeature(location=feat_loc,
                                     type='contig',
                                     qualifiers={'id': ctg_num})
                scaff_record.features.append(feature)
            scaff_record.description = g_name+" scaffold from "+ref_n
            try:
                scaff_record.id = g_name
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
            except ValueError:
                scaff_record.id = g_name[:10]
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
            print ""
Exemplo n.º 15
0
def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs,
                     run_id, fixed_dirs, blast_dtypes, references,
                     min_nt_match, min_nt_score, min_nt_idp, min_aa_match,
                     min_aa_score, min_aa_idp, capture_span, timestamp):
    """Collect Blast results and extract match contigs."""
    # load inputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    match_root = run_root + run_dirs['match_out_dir'] + ref_n + "/"
    capture_root = run_root + run_dirs['capture_dir'] + ref_n + "/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # collect results
    ref_hits = {}
    control_scores = []
    run_ref.log("Segs/Gs\t")
    run_ref.log("\t".join([genome['name'] for genome in genomes]))
    for seg in run_ref.segs:
        seg_n = seg['name']
        print "\t", seg_n, "...",
        run_ref.log("".join(["\n", seg_n]))
        blast_dir = run_root + run_dirs[
            'blast_out_dir'] + ref_n + "/" + seg_n + "/"
        capture_dir = capture_root + "/" + seg_n + "/"
        ensure_dir([blast_dir, capture_dir])
        ref_flag = True
        for genome in genomes:
            g_name = genome['name']
            print "|",
            # process
            if g_name not in ref_hits.keys():
                ref_hits[g_name] = {}
            matches_dir = match_root + g_name + "/"
            ensure_dir([matches_dir])
            blast_infile = blast_dir + g_name + "_out.txt"
            genome_ctg_dir = fixed_dirs['fas_contigs_dir'] + g_name + "/"
            rec_array = read_array(blast_infile, blast_dtypes)
            if len(rec_array) > 0:  # take qualified hits
                p_cnt = 0
                n_cnt = 0
                if g_name in [ref['name'] for ref in references]:
                    copyfile(genome_ctg_dir + g_name + "_1.fas",
                             matches_dir + g_name + ".fas")
                    if ref_flag:
                        # positive control TODO: better solution
                        control_scores.append(rec_array[0][11])
                        ref_flag = False
                for line in rec_array:
                    idp = line[2]
                    q_start, q_stop = line[8], line[9]
                    score = line[11]
                    length = abs(q_stop - q_start)
                    # check the blast mode to use the right thresholds
                    if blast_mode == 'n' or blast_mode == 'tx':
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    elif blast_mode == 'tn':
                        min_match = min_aa_match
                        min_score = min_aa_score
                        min_idp = min_aa_idp
                    else:  # default to nucleotide mode
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    if length > min_match and score > min_score and idp > min_idp:
                        print "+",
                        p_cnt += 1
                        contig_id = line[1]
                        if contig_id not in ref_hits[g_name].keys():
                            ref_hits[g_name][contig_id] = {seg_n: score}
                        else:
                            ref_hits[g_name][contig_id][seg_n] = score
                        pattern = re.compile(r'(' + contig_id + ')\.fas')
                        for item in listdir(genome_ctg_dir):
                            match = re.match(pattern, item)
                            if match:
                                fas_file = matches_dir + match.group(
                                    1) + ".fas"
                                if not path.exists(fas_file):
                                    copyfile(genome_ctg_dir + item, fas_file)
                        # context capture
                        capture_flag = False
                        while True:
                            try:
                                if int(seg_n) in run_ref.capture:
                                    capture_flag = True
                                else:
                                    break
                            except ValueError:
                                if seg_n in run_ref.capture:
                                    capture_flag = True
                                else:
                                    break
                            else:
                                break
                        if capture_flag:
                            # load the sequence
                            contig_file = matches_dir + contig_id + ".fas"
                            contig_rec = load_fasta(contig_file)
                            # check orientation
                            if q_start < q_stop:
                                c_start = q_start - capture_span
                                c_stop = q_stop + capture_span
                            else:
                                c_start = q_stop - capture_span
                                c_stop = q_start + capture_span
                            print c_start, c_stop
                            # check limits
                            if c_start < 0:
                                c_start = 1
                            if c_stop > len(contig_rec.seq):
                                c_stop = len(contig_rec.seq)
                            # proceed
                            cxt_file = capture_dir + g_name + "_" + contig_id + ".fas"
                            cxt_rec = SeqRecord(
                                id=contig_id + "_" + str(c_start) + "_" +
                                str(c_stop),
                                seq=contig_rec.seq[c_start:c_stop])
                            write_fasta(cxt_file, cxt_rec)
                    else:
                        print "-",
                        n_cnt += 1
                if n_cnt > 0:
                    logstring = "".join(
                        ["\t", str(p_cnt), " (",
                         str(n_cnt), ")"])
                else:
                    logstring = "".join(["\t", str(p_cnt)])
                run_ref.log(logstring)
            else:
                print "-",
                run_ref.log("".join(["\t", "0"]))
        print ""
    return ref_hits, control_scores
Exemplo n.º 16
0
def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs,
                     run_id, fixed_dirs, blast_dtypes, references,
                     min_nt_match, min_nt_score, min_nt_idp, min_aa_match,
                     min_aa_score, min_aa_idp, capture_span, timestamp):
    """Collect Blast results and extract match contigs."""
    # load inputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    match_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    capture_root = run_root+run_dirs['capture_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # collect results
    ref_hits = {}
    control_scores = []
    run_ref.log("Segs/Gs\t")
    run_ref.log("\t".join([genome['name'] for genome in genomes]))
    for seg in run_ref.segs:
        seg_n = seg['name']
        print "\t", seg_n, "...",
        run_ref.log("".join(["\n", seg_n]))
        blast_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg_n+"/"
        capture_dir = capture_root+"/"+seg_n+"/"
        ensure_dir([blast_dir, capture_dir])
        ref_flag = True
        for genome in genomes:
            g_name = genome['name']
            print "|",
            # process
            if g_name not in ref_hits.keys():
                ref_hits[g_name] = {}
            matches_dir = match_root+g_name+"/"
            ensure_dir([matches_dir])
            blast_infile = blast_dir+g_name+"_out.txt"
            genome_ctg_dir = fixed_dirs['fas_contigs_dir']+g_name+"/"
            rec_array = read_array(blast_infile, blast_dtypes)
            if len(rec_array) > 0:  # take qualified hits
                p_cnt = 0
                n_cnt = 0
                if g_name in [ref['name'] for ref in references]:
                    copyfile(genome_ctg_dir+g_name+"_1.fas",
                             matches_dir+g_name+".fas")
                    if ref_flag:
                        # positive control TODO: better solution
                        control_scores.append(rec_array[0][11])
                        ref_flag = False
                for line in rec_array:
                    idp = line[2]
                    q_start, q_stop = line[8], line[9]
                    score = line[11]
                    length = abs(q_stop-q_start)
                    # check the blast mode to use the right thresholds
                    if blast_mode == 'n' or blast_mode == 'tx':
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    elif blast_mode == 'tn':
                        min_match = min_aa_match
                        min_score = min_aa_score
                        min_idp = min_aa_idp
                    else: # default to nucleotide mode
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    if length>min_match and score>min_score and idp>min_idp:
                        print "+",
                        p_cnt +=1
                        contig_id = line[1]
                        if contig_id not in ref_hits[g_name].keys():
                            ref_hits[g_name][contig_id] = {seg_n: score}
                        else:
                            ref_hits[g_name][contig_id][seg_n] = score
                        pattern = re.compile(r'('+contig_id+')\.fas')
                        for item in listdir(genome_ctg_dir):
                            match = re.match(pattern, item)
                            if match:
                                fas_file = matches_dir+match.group(1)+".fas"
                                if not path.exists(fas_file):
                                    copyfile(genome_ctg_dir+item, fas_file)
                        # context capture
                        capture_flag = False
                        while True:
                            try:
                                if int(seg_n) in run_ref.capture:
                                    capture_flag = True
                                else:
                                    break
                            except ValueError:
                                if seg_n in run_ref.capture:
                                    capture_flag = True
                                else:
                                    break
                            else:
                                break
                        if capture_flag:
                            # load the sequence
                            contig_file = matches_dir+contig_id+".fas"
                            contig_rec = load_fasta(contig_file)
                            # check orientation
                            if q_start < q_stop:
                                c_start = q_start-capture_span
                                c_stop = q_stop+capture_span
                            else:
                                c_start = q_stop-capture_span
                                c_stop = q_start+capture_span
                            print c_start, c_stop
                            # check limits
                            if c_start < 0:
                                c_start = 1
                            if c_stop > len(contig_rec.seq):
                                c_stop = len(contig_rec.seq)
                            # proceed
                            cxt_file = capture_dir+g_name+"_"+contig_id+".fas"
                            cxt_rec = SeqRecord(id=contig_id+"_"
                                                    +str(c_start)+"_"
                                                    +str(c_stop),
                                                seq=contig_rec.seq
                                                    [c_start:c_stop])
                            write_fasta(cxt_file, cxt_rec)
                    else:
                        print "-",
                        n_cnt +=1
                if n_cnt > 0:
                    logstring = "".join(["\t", str(p_cnt), " (",
                                         str(n_cnt), ")"])
                else:
                    logstring = "".join(["\t", str(p_cnt)])
                run_ref.log(logstring)
            else:
                print "-",
                run_ref.log("".join(["\t", "0"]))
        print ""
    return ref_hits, control_scores
#!/usr/bin/env python

from sys import argv
from parsers import parse_fasta, LongFastaID
from writers import write_fasta

if __name__ == "__main__":
    # usage: <script> <input fasta> <output fasta>
    source_path, target_path = argv[1], argv[2]

    # Re-key every sequence by the protein id found in its long header;
    # a repeated protein id trips the assertion.
    prot_id_to_seq = {}
    for raw_header, sequence in parse_fasta(source_path).items():
        prot_id = LongFastaID(raw_header).protein_id
        assert prot_id not in prot_id_to_seq
        prot_id_to_seq[prot_id] = sequence

    write_fasta(prot_id_to_seq, target_path)

        
Exemplo n.º 18
0
from parsers import parse_fasta, LongFastaID
from writers import write_fasta

from sys import argv

# Genome accession whose entries separate_ref_from_nonref() routes to the
# reference dictionary; every other accession goes to the non-reference one.
REFACC = "NC_045512.2"


def separate_ref_from_nonref(fasta_dir):
    """Split the parsed FASTA entries into reference and non-reference sets.

    Each header is wrapped in LongFastaID; entries whose genome accession
    equals REFACC go to the first dictionary, all others to the second.
    Both dictionaries are keyed by protein id.

    Returns a tuple (ref_fasta_dict, nonref_fasta_dict).
    """
    from_reference, from_others = {}, {}

    for raw_header, sequence in parse_fasta(fasta_dir).items():
        parsed = LongFastaID(raw_header)
        is_reference = parsed.genome_acc == REFACC
        destination = from_reference if is_reference else from_others
        destination[parsed.protein_id] = sequence

    return from_reference, from_others


if __name__ == "__main__":
    # usage: <script> <fasta>
    reference_seqs, other_seqs = separate_ref_from_nonref(argv[1])

    # Reference proteins first, then everything else.
    write_fasta(reference_seqs, 'reference_proteome.fasta')
    write_fasta(other_seqs, 'non-reference_covid19_proteins.fasta')
Exemplo n.º 19
0
            
if __name__ == "__main__":
    # usage: <script> <blast output> <covid19 fasta> <orf mapping>
    blast_path, proteome_path, orf_path = argv[1], argv[2], argv[3]

    covid19_fasta_dict = parse_fasta(proteome_path)
    orf_mapping = parse_mapping(orf_path)

    # Parse and order the BLAST hits, then merge the hits of every query.
    sorted_hit_dict = order_hits(parse_blast_output(blast_path))
    combined_hit_dicts = {
        query: combine_hits(subject_to_hits)
        for query, subject_to_hits in sorted_hit_dict.items()
    }

    filtered_hits, identical_hits, _rejected_hits = filter_hits(
        combined_hit_dicts, covid19_fasta_dict)

    # Group the surviving queries per subject and build one FASTA each.
    fasta_dicts = make_fasta_dicts(map_subject_to_queries(filtered_hits),
                                   covid19_fasta_dict, orf_mapping)

    for protein_name, members in fasta_dicts.items():
        file_name = protein_name + '.fasta'
        seq_to_id = make_seq_to_id_dict(members, identical_hits,
                                        covid19_fasta_dict)
        write_fasta(make_new_fasta_dict(seq_to_id), file_name)

    
from writers import write_fasta
from sys import argv


def get_refseqs(refseq_to_uniprot):
    """Return the set of accessions with everything after the first '.' removed.

    Iterates over *refseq_to_uniprot* (for a dict, its keys) and keeps the
    part of each entry that precedes the first dot.
    """
    return {accession.split('.')[0] for accession in refseq_to_uniprot}


if __name__ == "__main__":
    # usage: <script> <fasta> <refseq mapping>
    fasta_path, mapping_path = argv[1], argv[2]

    # Accessions (version suffix removed) that should be kept.
    refseqs = get_refseqs(parse_mapping(mapping_path))

    refseq_to_seq = {}
    for header, sequence in parse_fasta(fasta_path).items():
        # The accession is the first '|'-separated field of the header.
        leading_field = header.split('|')[0]
        print(leading_field)
        accession = leading_field.strip()
        if accession in refseqs:
            refseq_to_seq[accession] = sequence

    write_fasta(refseq_to_seq, 'reference_proteome_complete.fasta')
Exemplo n.º 21
0
    sequence_to_id = {}
    for fasta_id, sequence in fasta_dict.items():
        fasta_id = parse_fasta_id(fasta_id)
        if not sequence in sequence_to_id:
            sequence_to_id[sequence] = []

        sequence_to_id[sequence].append(fasta_id)

    return sequence_to_id


def assign_code(sequence_to_id):
    """Give every unique sequence a short code of the form ``seq_NNNN``.

    Codes are numbered from 0 in the iteration order of *sequence_to_id*.

    Returns a tuple (code_to_sequence, code_to_accession): the first maps
    each code to its sequence, the second maps the same code to the value
    stored for that sequence in *sequence_to_id*.
    """
    coded_entries = [
        ('seq_%.4d' % index, sequence, accessions)
        for index, (sequence, accessions) in enumerate(sequence_to_id.items())
    ]
    code_to_sequence = {code: sequence for code, sequence, _ in coded_entries}
    code_to_accession = {code: accessions
                         for code, _, accessions in coded_entries}
    return code_to_sequence, code_to_accession


if __name__ == "__main__":
    # usage: <script> <fasta>
    # Collapse identical sequences, then hand out one code per sequence.
    ids_by_sequence = reverse_fasta_dict(parse_fasta(argv[1]))
    code_to_sequence, code_to_accession = assign_code(ids_by_sequence)

    write_fasta(code_to_sequence, UNIQUE_SEQ_DIR)
    write_code_to_accession(code_to_accession, CODE_DIR)