Example #1
    def seq_concat_id_fa(self, input_file_path, output_file_path):
        input = fa.SequenceSource(input_file_path)
        output = open(output_file_path, "w")

        while input.next():
            output.write(input.id + "#" + input.seq + "\n")
        output.close()
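Throughout these examples, `fa`, `fastalib`, and `u` all appear to be aliases for the same module, Meren's fastalib from illumina-utils; Examples #8, #20, and #23 import it explicitly, and the traceback quoted in Example #19 shows its path. Each snippet assumes something like:

import IlluminaUtils.lib.fastalib as fa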
Example #2
def start_fasta_single(infile):
    """
    Check defline format
    >seqid|otherstuff
    Check sequences for ATGC only
    """

    f = fastalib.SequenceSource(infile)
    count = 1
    while f.next():
        line_no = count * 2
        id = f.id.split()[0].split('|')[0]  # <space> and bar '|' are the ONLY two dividers allowed
        #print id
        seq = f.seq
        #print seq
        if re.search(id_pattern, id):
            errors.append('ERROR: SeqID (' + id +
                          ') contains bad character(s) about line ' +
                          str(line_no))
        if re.search(sequence_pattern, seq):
            errors.append('ERROR: Sequence (id=' + id +
                          ') contains bad character(s) on about line ' +
                          str(line_no))

        count += 1

    if len(errors) > 1:  # note: this lets a single error validate; 'if errors:' may be intended
        return errors
    else:
        return notes + ['OK -- File Validates']
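Example #2 uses module-level names that the snippet does not show (`errors`, `notes`, `id_pattern`, `sequence_pattern`). A minimal sketch of what they might look like; the exact patterns are assumptions based on the docstring, not taken from the source:

import re

# assumed module-level collectors the validators append to
errors = []
notes = []

# assumed: match any character NOT allowed in a sequence id
id_pattern = re.compile(r'[^A-Za-z0-9_.:-]')
# assumed: match any character outside ATGC (the docstring says "ATGC only")
sequence_pattern = re.compile(r'[^ATGCatgc]')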
Example #3
    def unsplit_fa(self, input_file_path, output_file_path):
        input = fa.SequenceSource(input_file_path)
        output = fa.FastaOutput(output_file_path)

        while input.next():
            output.store(input, split=False)
        output.close()
Example #4
    def cut_region(self, input_file_path):
        input = fa.SequenceSource(input_file_path)
        self.total_seq = 0

        while input.next():
            self.total_seq += 1
            self.get_region(input, self.args.forward_primer,
                            self.args.distal_primer)
Example #5
 def parse_input(self):
     print "self.compressed = "
     print self.compressed
     fasta = fa.SequenceSource(self.filename, self.compressed)
     while fasta.next():
         fasta.seq = fasta.seq.upper()
         self.number_of_sequences += 1
         id = self.parse_taxonomy(fasta.id)
         self.parse_seq(id, fasta.seq)
Example #6
    def read_file_and_collect_info(self, in_fa_gz_file_name):
        print in_fa_gz_file_name
        input = fa.SequenceSource(in_fa_gz_file_name)

        while input.next():
            t0 = utils.benchmark_w_return_1("parse_id")
            locus = self.parse_id(input.id)
            utils.benchmark_w_return_2(t0, "parse_id")

            self.sequences[locus.strip()] = input.seq.strip()
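The `utils.benchmark_w_return_1/2` helpers are not shown. A plausible sketch, with the names taken from the call sites and the behavior assumed to be a simple wall-clock timer:

import time

def benchmark_w_return_1(label):
    # assumed: start timing the step named `label` and return the start time
    return time.time()

def benchmark_w_return_2(t0, label):
    # assumed: report the elapsed wall-clock time for the step
    print("%s took %.6f s" % (label, time.time() - t0))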
Example #7
 def get_out_file_names(self):
   print "get_out_file_names"
   n = 0
   f_input  = fastalib.SequenceSource(inputfile)
   while f_input.next():
     n+=1
     if (n % 100000 == 0 or n == 1):
       sys.stderr.write('\r[demultiplex] Reading FASTA into memory: %s\n' % (n))
       sys.stderr.flush()
     f_out_name = self.make_file_name(f_input.id)
     self.out_file_names.add(f_out_name)
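This snippet and Example #11 both rely on `self.make_file_name(f_input.id)` to map a defline to a per-dataset output file name. Its real logic is not shown; a hedged sketch, assuming the dataset name is the first whitespace-delimited field of the defline:

def make_file_name(self, defline):
    # assumed rule: name the output file after the first field of the defline
    dataset = defline.split()[0]
    return dataset + '.fa'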
Example #8
def go_single(args):
    """
        reads input fa file
        finds frequencies if present and expands 
        writes out  SEQFILE_CLEAN.fa in same directory
    """

    #sys.path.append('/groups/vampsweb/'+args.site+'/seqinfobin/merens-illumina-utils/')
    import IlluminaUtils.lib.fastalib as fastalib
    infile = args.infile

    print args.infile
    unique = False
    # should not unique until separated into datasets!!
    f = fastalib.SequenceSource(infile, unique=unique)
    pcounter = 0

    datasets = {}
    file_handles = {}
    fh = open(args.outfile, 'w')
    while f.next():

        defline_items = f.id.split('|')
        id_clean = defline_items[0].split()[0]
        freq = defline_items[-1]
        seq_clean = f.seq.upper().strip()
        #print freq

        cnt = '1'  # reset per record so one record's count cannot leak into the next
        if freq[:4] == 'freq':
            try:
                cnt = freq.split(':')[1]
            except IndexError:
                try:
                    cnt = freq.split('=')[1]
                except IndexError:
                    cnt = '1'
        if RepresentsInt(cnt):
            for i in range(1, int(cnt) + 1):
                id = id_clean + '_' + str(i)
                if args.stdout:
                    print '>' + id + '\n' + seq_clean
                else:
                    fh.write('>' + id + '\n' + seq_clean + '\n')

        else:
            if args.stdout:
                print '>' + id_clean + '\n' + seq_clean
            else:
                fh.write('>' + id_clean + '\n' + seq_clean + '\n')

    fh.close()
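`RepresentsInt` (also used in Example #20) is not shown; it is presumably the standard parse-test idiom:

def RepresentsInt(s):
    # True if s parses as an integer, e.g. the count pulled from 'frequency:23'
    try:
        int(s)
        return True
    except (TypeError, ValueError):
        return False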
Example #9
    def get_chimeric_ids(self, file_name):
        ids = set()
        print("Get ids from %s" % file_name)
        # todo: benchmark
        # read_fasta     = fa.ReadFasta(file_name)
        # # ids.update(set(read_fasta.ids))
        # ids = set(read_fasta.ids)
        chimeric_fasta = fa.SequenceSource(file_name, lazy_init=False)

        while next(chimeric_fasta):
            ids.add(chimeric_fasta.id)
        chimeric_fasta.close()
        return ids
Example #10
    def combine_w_gast_fa(self, input_file_path, output_file_path):
        output = fa.FastaOutput(output_file_path)

        fa_input = fa.SequenceSource(input_file_path)
        gast_file_name = input_file_path + ".gast"
        # read the gast file once, not once per sequence
        with open(gast_file_name, "r") as gast_file:
            gast_file_content = gast_file.readlines()
        while fa_input.next():
            res = self.lines_that_contain(fa_input.id, gast_file_content)
            gast_taxonomy = res[0].split("\t")
            id_gast = fa_input.id + "|" + gast_taxonomy[1]
            fa_input.id = id_gast
            output.store(fa_input, split=False)
        output.close()
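`self.lines_that_contain` is not shown; from its usage it filters the .gast lines for the current read id, something like:

def lines_that_contain(self, string, lines):
    # return every gast line that mentions the given read id
    return [line for line in lines if string in line]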
Example #11
 def demultiplex_input(self, inputfile):
   print "demultiplex_input"
   f_input  = fastalib.SequenceSource(inputfile)
   i = 0
   while f_input.next():
     i += 1
     id = f_input.id
     
     f_out_name = self.make_file_name(f_input.id)
     f_output   = self.out_files[f_out_name]
     self.write_id(f_output, id)
     self.write_seq(f_output, f_input.seq)
     if (i % 100000 == 0 or i == 1):
       sys.stderr.write('\r[demultiplex] Writing entries into files: %s\n' % (i))
       sys.stderr.flush()
Example #12
    def move_out_chimeric(self):
        chimeric_ids = self.get_chimeric_ids()
        for idx_key in self.input_file_names:
            fasta_file_path    = os.path.join(self.indir, self.input_file_names[idx_key])   
            # vestigial, never used:
            # read_fasta         = fa.ReadFasta(fasta_file_path)
            # read_fasta.close()
            
            non_chimeric_file  = fasta_file_path + self.nonchimeric_suffix
            non_chimeric_fasta = fa.FastaOutput(non_chimeric_file)

            fasta              = fa.SequenceSource(fasta_file_path, lazy_init = False) 
            while fasta.next():
                if fasta.id not in chimeric_ids:
                    non_chimeric_fasta.store(fasta, store_frequencies = False)
            non_chimeric_fasta.close()
Example #13
    def write_clean_abundance_file(self):
        """
        Writes the abundance file from the new names file and new unique file.
           These files have already had their ids checked from the deleted file
        """
        for lane_key in self.lane_keys:
            original_abundance_file = os.path.join(self.trim_dir,
                                                   lane_key + ".abund.fa")
            new_abundance_file = os.path.join(self.trim_dir,
                                              lane_key + ".newabund.fa")
            new_names_file = os.path.join(self.trim_dir, lane_key + ".names")
            new_unique_file = os.path.join(self.trim_dir,
                                           lane_key + ".unique.fa")
            names = {}
            uniques = {}

            deleted_id_list = self.deleted_ids[lane_key]
            if len(deleted_id_list) == 0:
                continue

            newnames_fh = open(new_names_file, "r")
            for line in newnames_fh.readlines():
                lst = line.strip().split()

                names[lst[0]] = lst[1].split(',')
            #print(names)
            fasta = fa.SequenceSource(new_unique_file)

            while fasta.next():
                uniques[fasta.seq] = fasta.id
            #print(uniques)
            sorted_uniques = mysort(uniques, names)

            for item in sorted_uniques:
                read_id = item[0]
                count = item[1]
                seq = item[2]

                sfastaRead = read_id + ";size=" + str(count)
                abundfa = sfasta(sfastaRead, seq)
                abundfa.write(new_abundance_file, 'a')

            # rename: abund => abund_dirty, newabund => abund
            os.rename(
                original_abundance_file,
                os.path.join(self.trim_dir, lane_key + ".abund_dirty.fa"))
            os.rename(new_abundance_file, original_abundance_file)
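Neither `mysort` nor `sfasta` appears in the snippet. Hedged sketches inferred purely from the call sites; the sort key and ordering are assumptions:

def mysort(uniques, names):
    # assumed: yield (read_id, abundance, sequence) tuples, most abundant first;
    # uniques maps sequence -> representative id, names maps id -> member ids
    items = [(read_id, len(names.get(read_id, [read_id])), seq)
             for seq, read_id in uniques.items()]
    return sorted(items, key=lambda t: t[1], reverse=True)

class sfasta:
    # minimal one-record FASTA writer matching the call pattern above
    def __init__(self, defline, seq):
        self.defline = defline
        self.seq = seq

    def write(self, path, mode='a'):
        with open(path, mode) as out:
            out.write('>' + self.defline + '\n' + self.seq + '\n')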
Example #14
    def move_out_chimeric(self):
        txt_ids = self.get_chimeric_ids(
            os.path.join(self.dir_name, self.chimeric_file_name_txt))
        db_ids = self.get_chimeric_ids(
            os.path.join(self.dir_name, self.chimeric_file_name_db))
        all_chimeric_ids = set(txt_ids) | set(db_ids)
        print("len(all_chimeric_ids) = ")
        print(len(all_chimeric_ids))

        non_chimeric_fasta = fa.FastaOutput(
            os.path.join(self.dir_name, self.output_file_name))
        orig_fasta = fa.SequenceSource(os.path.join(self.dir_name,
                                                    self.chg_file),
                                       lazy_init=False)

        while next(orig_fasta):
            if orig_fasta.id not in all_chimeric_ids:
                non_chimeric_fasta.store(orig_fasta, store_frequencies=False)
        non_chimeric_fasta.close()
Example #15
    def write_clean_uniques_file(self):
        """
        Write out a new unique file with all the deleted ids removed
           especially the chimeras which were detected after the original unique file
           was created.
        """
        for lane_key in self.lane_keys:

            deleted_id_list = self.deleted_ids[lane_key]
            if len(deleted_id_list) == 0:
                continue  # check before opening files so no empty .newunique.fa is left behind

            new_unique_file_name = os.path.join(self.trim_dir,
                                                lane_key + ".newunique.fa")
            new_unique_file = fa.FastaOutput(new_unique_file_name)
            original_unique_file = os.path.join(self.trim_dir,
                                                lane_key + '.unique.fa')

            # open unique file and read a line
            uniquesfasta = fa.SequenceSource(original_unique_file)
            while uniquesfasta.next():
                #print(uniquesfasta.id,self.orphans[lane_key])

                if uniquesfasta.id in self.orphans[lane_key].keys():
                    #print("found orphan",uniquesfasta.id)
                    uniquesfasta.id = self.orphans[lane_key][
                        uniquesfasta.id][0]
                    #print("new id",uniquesfasta.id)
                if uniquesfasta.id not in deleted_id_list:
                    new_unique_file.store(uniquesfasta)
            new_unique_file.close()

            # rename to newuniques => uniques
            os.rename(
                original_unique_file,
                os.path.join(self.trim_dir, lane_key + ".unique_dirty.fa"))
            os.rename(new_unique_file_name, original_unique_file)
Example #16
    def write_clean_fasta_file(self):
        """
        def to write a new fasta from the original fasta file
                using the deleted file

        The deleted file contains the trimming deleted as well
        as the chimera deleted
        Then write the uniques from Meren's fastalib
        """
        sleep(2)
        for lane_key in self.lane_keys:
            logger.debug("write_clean_fasta_file working on lanekey: " +
                         lane_key)
            deleted_id_list = self.deleted_ids[lane_key]
            if len(deleted_id_list) == 0:
                continue  # check before opening files so nothing is leaked on skip

            original_trimmed_file = os.path.join(self.trim_dir,
                                                 lane_key + ".trimmed.fa")
            new_trimmed_file_name = os.path.join(self.trim_dir,
                                                 lane_key + ".newtrimmed.fa")
            new_trimmed_file = fa.FastaOutput(new_trimmed_file_name)

            # open trimmed file and read a line
            trimmedfasta = fa.SequenceSource(original_trimmed_file)
            logger.debug(
                "write_clean_fasta_file about to check trimmedfasta file")
            while trimmedfasta.next():
                if trimmedfasta.id not in deleted_id_list:
                    new_trimmed_file.store(trimmedfasta)
            new_trimmed_file.close()

            # rename to newtrimmed => trimmed
            os.rename(
                original_trimmed_file,
                os.path.join(self.trim_dir,
                             lane_key + ".trimmed_with_chimera.fa"))
            os.rename(new_trimmed_file_name, original_trimmed_file)
Example #17
def start_fasta_multi(infile):
    """
    Check defline format
    >dsname|seqid|otherstuff
    Check sequences for ATGC only
    """

    f = fastalib.SequenceSource(infile)
    datasets_hash = {}
    all_seq_count = 0
    id_has_seq_count = False
    count_style_flip = 0
    while f.next():
        defline = f.id.split()
        if len(defline) > 1:
            #dataset_items = defline[0]
            ds_items = defline[0].split('_')
            #print len(ds_items),ds_items[-1]
            if len(ds_items) > 1:
                try:  # ie: 10056.000009544_123294
                    this_seq_count = int(ds_items[-1])
                    dataset = '_'.join(
                        ds_items[:-1]
                    )  # join in case there were multiple '_' instances
                    if not id_has_seq_count:
                        count_style_flip += 1
                    id_has_seq_count = True
                except ValueError:  # ie: 10056.000009544
                    this_seq_count = 1
                    dataset = defline[0]
                    if id_has_seq_count:
                        count_style_flip += 1
                    id_has_seq_count = False
            else:
                this_seq_count = 1
                dataset = defline[0]

            #print dataset
            datasets_hash[dataset] = 1
            id = defline[1]  # <space> and bar '|' are the ONLY two dividers
            seq = f.seq

        else:
            errors.append('ERROR: This file has the wrong format')
            break
        all_seq_count += 1
    #print 'flip',count_style_flip
    if count_style_flip > 1:
        errors.append(
            'ERROR: id style varied from "no count" to "count" too many times')
    #print all_seq_count
    #print len(datasets_hash)
    if all_seq_count == len(datasets_hash):
        errors.append(
            "ERROR: Looks like the number of datasets equals the number of sequences -- that can't be right. Maybe this is a single-dataset style fasta file?"
        )
    else:
        notes.append('Good: dataset count is: ' + str(len(datasets_hash)))
        notes.append('Good: sequence count is: ' + str(all_seq_count))

    if len(errors) > 1:  # note: this lets a single error validate; 'if errors:' may be intended
        return errors
    else:
        return notes + ['OK -- File Validates']
Example #18
def write_seqfiles(args):
    outdir = args.project_dir
    
    datasets = {}
    files = {}
    stats = {}
    analysis_dir = os.path.join(outdir,'analysis')
    gast_dir = os.path.join(analysis_dir,'gast')
    #gast_dir = os.path.join(outdir,'analysis/gast')
    
    if args.upload_type == 'single':
        ds = args.dataset
        datasets[ds] = 0
        ds_dir = os.path.join(gast_dir,ds)
        if not os.path.exists(ds_dir):
            os.makedirs(ds_dir, mode=0777)
        file = os.path.join(ds_dir,'seqfile.fa')
        fp = open(file,'w')
        files[ds] = fp
    seq_count = 0
    ds_count = 0
    
    f = fastalib.SequenceSource(args.fafile)
    #f = FastaReader(fafile)
    while f.next():
        defline = f.id
        
        if args.upload_type == 'single':
            ds = args.dataset
            # should split on pipe and space
            #id = defline.split('|')[0].split('_')[0]
            id = defline.replace(' ','|').split('|')[0]
            datasets[ds] += 1                
            fp.write('>'+id+"\n"+f.seq+"\n")
        else:    
        
            try:
                #id = defline.replace(' ','|')
                # mobe  defline='>10056.000010538_2 HWI-M00888:59:000000000-A62ET:1:1101:15096:1532 1:N:0:GACCGTAAACTC orig_bc=GACCGTAAACTC new_bc=GACCGTAAACTC bc_diffs=0'
                if 'orig_bc' in defline and 'new_bc' in defline:
                    #if there are orig_bc and new_bc in defline then assume mobe/qiime file
                    #and break up like this:
                    #print 'found mobe defline'
                    tmp = defline.replace(' ','|').split('|')
                    ds = tmp[0].split('_')[0]
                    #id = tmp[1]
                    id = tmp[0].split('_')[1]  
                else:
                    tmp = defline.replace(' ','|').split('|')
                    #print defline
                    ds = tmp[0]
                    id = tmp[1]
                ds_dir = os.path.join(gast_dir,ds)
                
                file = os.path.join(ds_dir,'seqfile.fa')
                if ds in datasets:
                    datasets[ds] +=1
                else:
                    datasets[ds] = 1
                if ds in files:
                    files[ds].write('>'+id+"\n"+f.seq+"\n")
                else:
                    if not os.path.exists(ds_dir):
                        os.makedirs(ds_dir, mode=0777)
                    #os.makedirs(ds_dir)
                    fp = open(file,'w')
                    files[ds] = fp
                    fp.write('>'+id+"\n"+f.seq+"\n")
            except:
                print "Please check the multi-dataset format: ( defline='>" + defline+"' )"
                sys.exit(1)
        
        seq_count += 1
    ds_count = len(datasets)
    f.close()
    #print datasets

    for ds in files:
        files[ds].close()
    stats['seq_count'] = seq_count
    stats['ds_count'] = ds_count
    stats['datasets'] = datasets
    return stats
Example #19
args = parse_arguments()
fa_path = args.fa_path
qual_path = args.qual_path
fq_path = args.fq_path
"""  
TODO:
if no qual - use fake and do not process qual_path
File "fasta_to_fastq.py", line 60, in <module>
    f_qual = fa.SequenceSource(qual_path)
  File "/bioware/python-2.7.12-201701011205/lib/python2.7/site-packages/illumina_utils-1.4.8-py2.7.egg/IlluminaUtils/lib/fastalib.py", line 84, in __init__
    self.file_pointer = open(self.fasta_file_path)
TypeError: coercing to Unicode: need string or buffer, NoneType found
"""

f_input = fa.SequenceSource(fa_path)
f_input_dict = make_a_dict(f_input)

if args.qual_path:
    f_qual = fa.SequenceSource(qual_path)
    f_qual_dict = make_a_dict(f_qual)

# print "f_input_dict"
# print f_input_dict
# print "f_qual_dict"
# print f_qual_dict


def convert_qual_scores(line):
    # res = []
    arr = line.split(" ")
Example #20
def go_multi(args):
    """
    NO:need qiime map file for ds names only
    and fasta file with defline like so:   >ds|id|frequency:23
    Should create directory structure: analysis/gast/ds for each ds found in seqfile
    
    
    """
    import IlluminaUtils.lib.fastalib as fastalib
    infile = args.infile
    unique = False
    data = {}
    f = fastalib.SequenceSource(infile, unique=unique)
    while f.next():
        defline_items = f.id.split(args.delim)
        dataset = defline_items[0]
        # if ds like M9Dkey217.141053_69
        # must remove the _69 from end
        # but not if like M9Dkey217_141053
        # so:
        test_ds_parts = dataset.split('_')
        if RepresentsInt(test_ds_parts[-1]):
            dataset = '_'.join(test_ds_parts[:-1])
        id = defline_items[1].split()[0]  # e.g. M01028:102:000000000-AK07B:1:1101:19698:4186 1:N:0:6
        freq = defline_items[-1]
        if dataset not in data:
            data[dataset] = []

        cnt = '1'  # reset per record so one record's count cannot leak into the next
        if freq[:4] == 'freq':
            try:
                cnt = freq.split(':')[1]
            except IndexError:
                try:
                    cnt = freq.split('=')[1]
                except IndexError:
                    cnt = '1'
        data[dataset].append({'id': id, 'seq': f.seq, 'cnt': cnt})

    # write the per-dataset files once, after the whole input has been read;
    # doing this inside the read loop would rmtree and rewrite every dataset
    # directory once per sequence
    analysis_dir = 'analysis'
    gast_dir = 'analysis/gast'

    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)
    if not os.path.exists(gast_dir):
        os.makedirs(gast_dir)
    for ds in data:

        if ds != '':
            dir = os.path.join(gast_dir, ds)
            if os.path.exists(dir):
                shutil.rmtree(dir)
            os.makedirs(dir)

            outfile = os.path.join(dir, args.outfile)
            fh = open(outfile, 'w')
            for rec in data[ds]:

                cnt = rec['cnt']
                id = rec['id']
                seq = rec['seq']
                if RepresentsInt(cnt):
                    for m in range(1, int(cnt) + 1):
                        idcnt = id + '_' + str(m)
                        if args.stdout:
                            print '>' + idcnt + '\n' + seq
                        else:
                            fh.write('>' + idcnt + '\n' + seq + '\n')
                else:
                    if args.stdout:
                        print '>' + id + '\n' + seq
                    else:
                        fh.write('>' + id + '\n' + seq + '\n')
            fh.close()

        else:
            print 'Empty ds name!!'
Example #21
    def sequences(self, key, tax_collector, read_id_lookup, file_collector):
        """
        fill vamps_sequences.txt file
        
        """

        logging.info("Writing to file: vamps_sequences_pipe")
        if self.runobj.vamps_user_upload or self.runobj.new_vamps_upload:
            project = self.runobj.project
            dataset = key
        else:
            if self.runobj.platform == 'illumina':
                project = self.runobj.samples[key].project
                dataset = self.runobj.samples[key].dataset
            elif self.runobj.platform == '454':
                pass
            else:
                pass

        project = project[0].capitalize() + project[1:]
        project_dataset = project + '--' + dataset
        # open gast_concat table to get the distances and the ferids
        refid_collector = {}
        #if os.path.exists(gast_concat_file):
        for line in open(file_collector['gast_concat_file'], 'r'):
            line = line.strip()
            items = line.split()
            id = items[0]
            distance = items[1]
            refhvr_ids = items[2]
            refid_collector[id] = {}
            refid_collector[id]['distance'] = distance
            refid_collector[id]['refhvr_ids'] = refhvr_ids

        fh = open(file_collector['sequences_file'], 'w')
        fh.write("\t".join([
            "HEADER", "project", "dataset", "taxonomy", "refhvr_ids", "rank",
            "seq_count", "frequency", "distance", "read_id", "project_dataset"
        ]) + "\n")

        # open uniques fa file
        if os.path.exists(file_collector['unique_file']) and os.path.getsize(
                file_collector['unique_file']) > 0:
            f = fastalib.SequenceSource(file_collector['unique_file'])

            while f.next():
                datarow = ['']
                defline_items = f.id.split('|')
                id = defline_items[0]
                cnt = defline_items[1].split(':')[1]
                seq = f.seq
                if id in read_id_lookup:
                    tax = read_id_lookup[id]
                else:
                    tax = ''

                if tax in tax_collector:
                    rank = tax_collector[tax]['rank']
                    #cnt = tax_collector[tax]['knt']
                    freq = tax_collector[tax]['freq']
                else:
                    rank = 'NA'
                    cnt = 0
                    freq = 0

                if id in refid_collector:
                    distance = refid_collector[id]['distance']
                    refhvr_ids = refid_collector[id]['refhvr_ids']
                else:
                    distance = '1.0'
                    refhvr_ids = '0'
                if not cnt:
                    cnt = 1
                datarow.append(seq)
                datarow.append(project)
                datarow.append(dataset)
                datarow.append(tax)
                datarow.append(refhvr_ids)
                datarow.append(rank)
                datarow.append(str(cnt))
                datarow.append(str(freq))
                datarow.append(distance)
                datarow.append(id)
                datarow.append(project_dataset)
                w = "\t".join(datarow)
                #print 'w',w
                fh.write(w + "\n")

            fh.close()
        return refid_collector
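The lookup structures this method consumes are plain dictionaries whose shapes can be read off the method body; the values below are illustrative only:

# assumed shapes, inferred from the lookups above
read_id_lookup = {'read_1': 'Bacteria;Proteobacteria'}
tax_collector = {'Bacteria;Proteobacteria': {'rank': 'phylum', 'freq': 12}}
file_collector = {
    'gast_concat_file': 'out/gast_concat.txt',  # id <tab> distance <tab> refhvr_ids
    'sequences_file': 'out/vamps_sequences.txt',
    'unique_file': 'out/uniques.fa',
}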
Example #22
def visualize_sequence_length_distribution(fasta_file_path,
                                           dest,
                                           title,
                                           max_seq_len=None,
                                           xtickstep=None,
                                           ytickstep=None):
    sequence_lengths = []

    fasta = u.SequenceSource(fasta_file_path)

    while next(fasta):
        if fasta.pos % 10000 == 0 or fasta.pos == 1:
            sys.stderr.write('\rReading: %s' %
                             (big_number_pretty_print(fasta.pos)))
            sys.stderr.flush()
        sequence_lengths.append(len(fasta.seq))

    sys.stderr.write('\n')

    if not max_seq_len:
        max_seq_len = max(sequence_lengths) + (int(
            max(sequence_lengths) / 100.0) or 10)

    seq_len_distribution = [0] * (max_seq_len + 1)

    for l in sequence_lengths:
        seq_len_distribution[l] += 1

    fig = plt.figure(figsize=(16, 12))
    plt.rcParams.update({'axes.linewidth': 0.9})
    plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)

    gs = gridspec.GridSpec(10, 1)

    ax1 = plt.subplot(gs[0:8])
    plt.grid(True)
    plt.subplots_adjust(left=0.05, bottom=0.03, top=0.95, right=0.98)

    plt.plot(seq_len_distribution, color='black', alpha=0.3)
    plt.fill_between(list(range(0, max_seq_len + 1)),
                     seq_len_distribution,
                     y2=0,
                     color='black',
                     alpha=0.15)
    plt.ylabel('number of sequences')
    plt.xlabel('sequence length')

    if xtickstep is None:
        xtickstep = (max_seq_len // 50) or 1

    if ytickstep is None:
        ytickstep = max(seq_len_distribution) // 20 or 1

    plt.xticks(list(range(xtickstep, max_seq_len + 1, xtickstep)),
               rotation=90,
               size='xx-small')
    plt.yticks(list(range(0,
                          max(seq_len_distribution) + 1, ytickstep)),
               size='xx-small')
    plt.ylim(ymin=0,
             ymax=max(seq_len_distribution) +
             (max(seq_len_distribution) / 20.0))
    plt.xlim(xmin=0, xmax=max_seq_len)
    plt.yticks(size='xx-small')

    plt.figtext(0.5,
                0.96,
                '%s' % (title),
                weight='black',
                size='xx-large',
                ha='center')

    ax1 = plt.subplot(gs[9])
    plt.rcParams.update({'axes.edgecolor': 20})
    plt.grid(False)
    plt.yticks([])
    plt.xticks([])
    plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\
        % (big_number_pretty_print(len(sequence_lengths)),
           numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\
           big_number_pretty_print(min(sequence_lengths)),\
           big_number_pretty_print(max(sequence_lengths))),\
        va = 'center', alpha = 0.8, size = 'x-large')

    try:
        plt.savefig(dest + '.pdf')
    except:
        plt.savefig(dest + '.png')
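`big_number_pretty_print` is not shown; a minimal sketch, assuming it just adds thousands separators:

def big_number_pretty_print(n):
    # e.g. 1234567 -> '1,234,567'
    return '{:,}'.format(int(n))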
Example #23
def get_datasets(args):
    """
   
    
    """
    print args
    sys.path.append('/groups/vampsweb/' + args.site +
                    '/seqinfobin/merens-illumina-utils/')
    import IlluminaUtils.lib.fastalib as fastalib
    # errors here are between 240 - 249
    seq_allowed = dict.fromkeys('AGCTUNRYMKSWHBVDagctunrymkswhbvd')
    readid_allowed = dict.fromkeys(
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_.:")
    bad_line = False
    dir = os.path.join('/groups/vampsweb/' + args.site + '/tmp/', args.code)
    gast_dir = os.path.join(dir, 'analysis/gast')
    datasets = {}

    out_file = os.path.join(dir, "SEQFILE_CLEAN.FA")
    for infile in os.listdir(args.indir):
        #if fileName[-3:]=='.fa':
        dataset = infile[:-3]
        datasets[dataset] = 0
        file_handles = {}

        new_dir = os.path.join(gast_dir, dataset)
        print new_dir
        os.makedirs(new_dir)
        # open new fa file
        fasta_file = os.path.join(new_dir, 'seqfile.fa')
        fh = open(fasta_file, 'w')
        # write defline and seq

        file_handles[dataset] = fh

        file_path = os.path.join(args.indir, infile)

        #if os.path.exists(infile):

        #import fastalib
        out_fh = open(out_file, 'w')

        # if multiple datasets in fa file then must use raw
        # to be able to get ds and id from defline
        # BUT if single have to assume that id is first and should use single
        raw_id = False
        unique = False

        raw_id = True

        # should not unique until separated into datasets!!
        f = fastalib.SequenceSource(file_path, unique=unique)
        # defline could be separated by spaces or '|'
        # from uclust otu creation: >Cluster10108;size=1  breaks here
        counter = 0

        while f.next():

            id_clean = f.id

            #print "ID:",id_clean
            if not all(x in seq_allowed for x in f.seq):
                bad_line = True
                msg = 'Sequence failed: ' + f.seq
                break  # exiting here would skip the "print msg; sys.exit(241)" below
            else:
                seq_clean = f.seq.upper().strip()

                #write to fa file
            file_handles[dataset].write('>' + id_clean + '\n' + seq_clean +
                                        '\n')

            #             else:
            #                 # create new directory in /gast
            #
            #                 new_dir = os.path.join(gast_dir,dataset)
            #                 print new_dir
            #                 os.makedirs(new_dir)
            #                 # open new fa file
            #                 fasta_file = os.path.join(new_dir,'seqfile.fa')
            #                 fh = open(fasta_file,'w')
            #                 # write defline and seq
            #                 fh.write('>'+id_clean+'\n'+seq_clean+'\n')
            #                 file_handles[dataset] = fh
            #                 # save dataset to datasets

            if not bad_line:
                out_fh.write('>' + dataset + ' ' + id_clean + "\n")
                out_fh.write(seq_clean + "\n")
                counter += 1
        datasets[dataset] = counter
        if bad_line:
            print msg
            sys.exit(241)
        if counter == 0:
            print "No sequences found! Remove any empty lines or comments at the top of your file and try again."
            sys.exit(242)
        #print str(counter)+" sequences processed"
        out_fh.close()
    #else:
    #    print "Could not find infile.",file_path
    #    sys.exit(244)

    sequence_count = counter
    print "sequence_count=" + str(sequence_count)
    print 'dir', dir
    print datasets
    return datasets