Example No. 1
def zip_up_directory(run_date, dirPath, mode='a'):
    """
    This should be run at the end of each process to zip the files in each directory
    """
    files_to_compress = ['fa', 'db', 'names', 'sff', 'fasta', 'fastq']
    assert os.path.isdir(dirPath)
    zipFilePath = os.path.join(dirPath, run_date + '.zip')

    zf = zipfile.ZipFile(zipFilePath, mode)

    for (archiveDirPath, dirNames, fileNames) in os.walk(dirPath):
        for file in fileNames:
            if file.split('.')[-1] in files_to_compress:
                filePath = os.path.join(archiveDirPath, file)
                zf.write(filePath, compress_type=zipfile.ZIP_DEFLATED)

    for i in zf.infolist():
        dt = datetime.datetime(*(i.date_time))
        logger.debug("%s\tSize: %sb\tCompressed: %sb\t\tModified: %s" %
                     (i.filename, i.file_size, i.compress_size, dt.ctime()))
        os.remove(i.filename)

    zf.close()
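
A hedged usage sketch for Example No. 1, assuming zip_up_directory() above is defined in the same module; the module-level imports and logger it relies on are shown, and the run date and directory path are made-up placeholders.

import datetime
import logging
import os
import zipfile

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)

# Append every fa/db/names/sff/fasta/fastq file under the (hypothetical) run
# output directory to 20150223.zip, then delete the archived originals.
zip_up_directory('20150223', '/path/to/run_output', mode='a')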
Example No. 2
 def __init__(self, lane_keys, runobj):
     self.inputFileName = {}
     self.orphans = {}
     self.lane_keys = lane_keys
     self.base_dir = runobj.output_dir
     self.trim_dir = os.path.join(self.base_dir, 'analysis/trimming')
     #self.chimera_dir = os.path.join(self.base_dir, 'analysis/chimera')
     self.deleted_ids = {}
     for lane_key in lane_keys:
         self.inputFileName[lane_key] = os.path.join(
             self.trim_dir, lane_key + ".trimmed.fa")
         self.orphans[lane_key] = {}
         deleted_file = os.path.join(self.trim_dir,
                                     lane_key + ".deleted.txt")
         self.deleted_ids[lane_key] = []
         if not (os.path.exists(deleted_file)
                 and os.path.getsize(deleted_file) > 0):
             logger.debug("No deleted sequences for lane: " + lane_key)
             continue
         del_fh = open(deleted_file, "r")
         #deleted_id_list = self.deleted_ids[lane_key] = []
         for line in del_fh.readlines():
             lst = line.strip().split()
             #deleted_id_list.append(lst[0])
             self.deleted_ids[lane_key].append(lst[0])
 def write_clean_fasta_file(self):
     """
     def to write a new fasta from the original fasta file 
             using the deleted file
             
     The deleted file contains the trimming deleted as well
     as the chimera deleted
     Then write the uniques from Meren's fastalib
     """
     sleep(2)
     for lane_key in self.lane_keys:
         logger.debug("write_clean_fasta_file working on lanekey: " + lane_key);
         deleted_id_list = []
         original_trimmed_file   = os.path.join(self.trim_dir, lane_key + ".trimmed.fa" )
         new_trimmed_file_name   = os.path.join(self.trim_dir, lane_key + ".newtrimmed.fa")
         new_trimmed_file        = FastaOutput(new_trimmed_file_name)
         
         # open trimmed file and read a line             
         trimmedfasta = SequenceSource(original_trimmed_file)
         logger.debug("write_clean_fasta_file about to check trimmedfasta file");
         deleted_id_list = self.deleted_ids[lane_key]
         if len(deleted_id_list) == 0:
             continue
         while trimmedfasta.next():
             if trimmedfasta.id not in deleted_id_list:
                 new_trimmed_file.store(trimmedfasta)
         new_trimmed_file.close()
         
         # rename to newtrimmed => trimmed
         os.rename(original_trimmed_file, os.path.join(self.trim_dir, lane_key + ".trimmed_with_chimera.fa" ))
         os.rename(new_trimmed_file_name, original_trimmed_file )
def get_keys(runobj):
    try:
        idx_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name,"r").read()))["new_lane_keys"]
        # {"status": "success", "new_lane_keys": ["1_GATGA"]}
    except:
        # here we have no idx_keys - must create them from run
        # if illumina they are index_runkey_lane concatenation
        # if 454 they are lane_key
        if runobj.vamps_user_upload:
            #print 'KEYS: '+' '.join(runobj.run_keys)
            idx_keys=runobj.samples.keys()
        else:
            if runobj.platform == 'illumina':  
                idx_keys = runobj.idx_keys
                ct = 0
                for h in runobj.samples:
                    logger.debug(h)
#                    logger.debug(h,runobj.samples[h]) #TypeError: not all arguments converted during string formatting
                    ct +=1
            elif runobj.platform == '454':
                idx_keys = runobj.idx_keys
            elif runobj.platform == 'ion_torrent':
                idx_keys = runobj.idx_keys
            else:
                logger.debug("GAST: No keys found - Exiting")
                runobj.run_status_file_h.write("GAST: No keys found - Exiting\n")
                sys.exit()
    if type(idx_keys) is types.StringType:
        return idx_keys.split(',')
    elif type(idx_keys) is types.ListType:
        return idx_keys
    else:
        return None
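
For reference, a small sketch of the trim-status payload get_keys() parses on the happy path, matching the format noted in the comment above; the second lane key is illustrative.

import json

# Assumed contents of runobj.trim_status_file_name after a successful trim run:
payload = '{"status": "success", "new_lane_keys": ["1_GATGA", "3_TGACCA"]}'
print(json.loads(payload)["new_lane_keys"])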
Example No. 5
def trim_anchor_helper(anchor_name, expanded_anchor_sequences, freedom, length,
                       start, sequence):
    exact = ''
    exactTrimmedOff = ''

    logger.debug('looking for anchor: ' + anchor_name + " start: " +
                 str(start) + " length: " + str(length))
    max_divergence = C.max_divergence
    logger.debug('anchor_list: ' + str(expanded_anchor_sequences))
    list_of_tuples = anchortrim.generate_tuples(start,
                                                freedom,
                                                length,
                                                list_of_tuples=[],
                                                reversed_read=False)
    logger.debug('anchor tuples: ' + str(list_of_tuples))
    anchor, location = anchortrim.find_best_distance(
        sequence, expanded_anchor_sequences, max_divergence, list_of_tuples)

    if anchor and location:
        logger.debug('anchor: ' + anchor + ' loc tuple: ' + str(location))
        trimmed_sequence = sequence[:location[1]]  # same thing here for the reversed == False
        exact = anchor
        exactTrimmedOff = sequence[location[1]:]
    else:
        logger.debug('no anchor location found')
        trimmed_sequence = sequence
    return exact, exactTrimmedOff, trimmed_sequence
def trim(run):
    # (re) create the trim status file
    run.trim_status_file_h = open(run.trim_status_file_name, "w")
    # do the trim work
    mytrim = TrimRun(run)        
    # pass True to write out the straight fasta file of all trimmed non-deleted seqs
    # Remember: this is before chimera checking
    trim_codes = mytrim.trimrun(True)
    trim_results_dict = {}
    if trim_codes[0] == 'SUCCESS':
        # setup to write the status
        new_lane_keys = trim_codes[2]
        trim_results_dict['status'] = "success"
        trim_results_dict['new_lane_keys'] = new_lane_keys
        logger.debug("Trimming finished successfully")
        # write the data files
        mytrim.write_data_files(new_lane_keys)
        run.trim_status_file_h.write(json.dumps(trim_results_dict))
        run.trim_status_file_h.close()
    else:
        logger.debug("Trimming finished ERROR")
        trim_results_dict['status'] = "error"
        trim_results_dict['code1'] = trim_codes[1]
        trim_results_dict['code2'] = trim_codes[2]
        run.trim_status_file_h.write(json.dumps(trim_results_dict))
        run.trim_status_file_h.close()
        sys.exit()
Example No. 7
def illumina_files(runobj):
    logger.debug("Start Overlap, filter and unique reads")

    utils = PipelneUtils()
    start = time.time()
    #     illumina_files_demultiplex_only(runobj)
    illumina_files_inst = IlluminaFiles(runobj)
    if runobj.do_perfect:
        #         illumina_files_inst.perfect_reads()
        script_file_name = illumina_files_inst.merge_perfect()
        utils.run_until_done_on_cluster(script_file_name)

        script_file_name = illumina_files_inst.trim_primers_perfect()
        utils.run_until_done_on_cluster(script_file_name)

    else:
        #         illumina_files_inst.partial_overlap_reads()
        #         pass
        # TODO: test utils.run_until_done_on_cluster(illumina_files_inst.partial_overlap_reads_cluster())
        # TODO: add cutting to 251
        script_file_name = illumina_files_inst.partial_overlap_reads_cluster()
        utils.run_until_done_on_cluster(script_file_name)

        script_file_name = illumina_files_inst.filter_mismatches_cluster()
        utils.run_until_done_on_cluster(script_file_name)

    #         illumina_files_inst.filter_mismatches()
    #     illumina_files_inst.uniq_fa()
    script_file_name = illumina_files_inst.uniq_fa_cluster()
    utils.run_until_done_on_cluster(script_file_name)
    #     illumina_chimera(runobj)
    elapsed = (time.time() - start)
    logger.debug("illumina_files time = %s" % str(elapsed))
def get_keys(runobj):
    try:
        idx_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name, "r").read()))[
            "new_lane_keys"
        ]
        # {"status": "success", "new_lane_keys": ["1_GATGA"]}
    except:
        # here we have no idx_keys - must create them from run
        # if illumina they are index_runkey_lane concatenation
        # if 454 they are lane_key
        if runobj.platform == "illumina":
            idx_keys = runobj.idx_keys
            ct = 0
            for h in runobj.samples:
                logger.debug("%s %s" % (h, runobj.samples[h]))
                ct += 1
            print ct
        elif runobj.platform == "454":
            idx_keys = runobj.idx_keys
        elif runobj.platform == "ion_torrent":
            idx_keys = runobj.idx_keys
        elif runobj.platform == "vamps":
            idx_keys = [runobj.user + runobj.run]
        else:
            logger.debug("GAST: No keys found - Exiting")
            runobj.run_status_file_h.write("GAST: No keys found - Exiting\n")
            sys.exit()
    if type(idx_keys) is types.StringType:
        return idx_keys.split(",")
    elif type(idx_keys) is types.ListType:
        return idx_keys
    else:
        return None
    def chimera_checking(self, ref_or_novo):
        chimera_region_found = False
        output = {}
        
        for idx_key in self.input_file_names:
#             print "idx_key, self.input_file_names[idx_key] = %s, %s" % (idx_key, self.input_file_names)
            input_file_name  = os.path.join(self.indir,  self.input_file_names[idx_key] + self.chg_suffix)        
            output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key])        
            dna_region       = self.runobj.samples[idx_key].dna_region
#             print "dna_region = %s" % dna_region
            if dna_region in C.regions_to_chimera_check:
                chimera_region_found = True
            else:
                logger.debug('region not checked: ' +  dna_region)
                continue
            
#             print "input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name)
            ref_db     = self.get_ref_db(dna_region)
#             print "dna_region = %s; ref_db = %s; ref_or_novo = %s" % (dna_region, ref_db, ref_or_novo)
            
            uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db)
            print "\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd)
            
            try:
                logger.info("chimera checking command: " + str(uchime_cmd))
                output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            except OSError, e:
                print "Problems with this command: %s" % (uchime_cmd)
                print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                if not self.utils.is_local():
                    raise
def process(runobj, steps):

    requested_steps = steps.split(",")
    if "clean" in requested_steps and len(requested_steps) > 1:
        sys.exit("The clean step cannot be combined with other steps - Exiting")

    # create output directory:
    # this should have been created in pipeline-ui.py. but just in case....
    if not os.path.exists(runobj.output_dir):
        logger.debug("Creating output directory: " + runobj.output_dir)
        os.makedirs(runobj.output_dir)

    # Open run STATUS File here.
    # open in append mode because we may start the run in the middle
    # say at the gast stage and don't want to overwrite.
    # if we re-run trimming we'll get two trim status reports
    runobj.run_status_file_h = open(runobj.run_status_file_name, "a")

    # loop through official list...this way we execute the
    # users requested steps in the correct order

    for step in C.existing_steps:
        if step in requested_steps:
            # call the method in here
            step_method = globals()[step]
            step_method(runobj)
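
A standalone sketch of the globals()-based step dispatch that process() uses above: requested steps run in the master-list order, not the order the user typed them. The step names, master list, and run label here are illustrative stand-ins for the pipeline's real steps and C.existing_steps.

def trim(runobj):
    print("trim: %s" % runobj)

def gast(runobj):
    print("gast: %s" % runobj)

existing_steps = ["trim", "gast"]           # stands in for C.existing_steps (master order)
requested_steps = "gast,trim".split(",")    # the user's order is ignored

for step in existing_steps:
    if step in requested_steps:
        step_method = globals()[step]       # look the step function up by name
        step_method("run-20150223")         # prints trim first, then gast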
Example No. 11
    def chimera_checking(self, ref_or_novo):
        chimera_region_found = False
        output = {}
        
        for idx_key in self.input_file_names:
#             print "idx_key, self.input_file_names[idx_key] = %s, %s" % (idx_key, self.input_file_names)
            input_file_name  = os.path.join(self.indir,  self.input_file_names[idx_key] + self.chg_suffix)        
            output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key])        
            dna_region       = self.runobj.samples[idx_key].dna_region
#             print "dna_region = %s" % dna_region
            if dna_region in C.regions_to_chimera_check:
                chimera_region_found = True
            else:
                logger.debug('region not checked: ' +  dna_region)
                continue
            
#             print "input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name)
            ref_db     = self.get_ref_db(dna_region)
#             print "dna_region = %s; ref_db = %s; ref_or_novo = %s" % (dna_region, ref_db, ref_or_novo)
            
            uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db)
            self.utils.print_both("\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd))
            
            try:
                logger.info("chimera checking command: " + str(uchime_cmd))
                output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            except OSError, e:
                self.utils.print_both("Problems with this command: %s" % (uchime_cmd))
                print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                if not self.utils.is_local():
                    self.utils.print_both("Execution of %s failed: %s" % (uchime_cmd, e))
                    raise
def illumina_files(runobj):
    logger.debug("Start Overlap, filter and unique reads")

    utils = PipelneUtils()
    start = time.time()
    #     illumina_files_demultiplex_only(runobj)
    illumina_files_inst = IlluminaFiles(runobj)
    if runobj.do_perfect:
        #         illumina_files_inst.perfect_reads()
        script_file_name = illumina_files_inst.merge_perfect()
        utils.run_until_done_on_cluster(script_file_name)

        script_file_name = illumina_files_inst.trim_primers_perfect()
        utils.run_until_done_on_cluster(script_file_name)

    else:
        #         illumina_files_inst.partial_overlap_reads()
        #         pass
        # TODO: test utils.run_until_done_on_cluster(illumina_files_inst.partial_overlap_reads_cluster())
        # TODO: add cutting to 251
        script_file_name = illumina_files_inst.partial_overlap_reads_cluster()
        utils.run_until_done_on_cluster(script_file_name)

        script_file_name = illumina_files_inst.filter_mismatches_cluster()
        utils.run_until_done_on_cluster(script_file_name)

    #         illumina_files_inst.filter_mismatches()
    #     illumina_files_inst.uniq_fa()
    script_file_name = illumina_files_inst.uniq_fa_cluster()
    utils.run_until_done_on_cluster(script_file_name)
    #     illumina_chimera(runobj)
    elapsed = (time.time() - start)
    logger.debug("illumina_files time = %s" % str(elapsed))
Example No. 13
 def is_local(self):
     curr_hostname = os.uname()[1]
     logger.debug("curr_hostname: %s" % curr_hostname) 
     dev_comps = ["ashipunova.mbl.edu", "as-macbook.home", "as-macbook.local", "Ashipunova.local", "Annas-MacBook-new.local", "Annas-MacBook.local", "Anna's MacBook Pro", "annasmacbooknew.mbl.edu mblad.mbl.edu printers.mbl.edu jbpc.mbl.edu jbpc-np.mbl.edu"] 
     if curr_hostname in dev_comps:
         return True
     else:
         return False
Example No. 14
 def is_vamps(self):
     curr_hostname = os.uname()[1] 
     logger.debug("curr_hostname: %s" % curr_hostname) 
     vamps_comps = ['bpcweb8','bpcweb7','bpcweb7.bpcservers.private', 'bpcweb8.bpcservers.private', 'vampsdev', 'vampsdb']
     if curr_hostname in vamps_comps:
         return True
     else:
         return False
Example No. 15
 def insert_sequence_uniq_info_ill(self, fasta, gast_dict):
     (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts, max_pcts, na_pcts, refhvr_ids) = gast_dict[fasta.id]
     sequence_ill_id = self.seq_id_dict[fasta.seq]
     if taxonomy in self.tax_id_dict:
         try:
             taxonomy_id = self.tax_id_dict[taxonomy] 
         except Exception, e:
             logger.debug("Error = %s" % e)
             raise
Example No. 16
 def get_ref_db(self, dna_region):
     ref_db = ''
     if dna_region.upper() == 'ITS':
         logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
         ref_db = self.its_refdb
     else:
         logger.debug("using standard refdb: " + self.refdb)
         ref_db = self.refdb
     return ref_db       
Example No. 17
def illumina_files_demultiplex_only(runobj):
    logger.debug("Start Demultiplex Illumina files by index/run_key/lane")

    start = time.time()
    illumina_files_inst = IlluminaFiles(runobj)
    illumina_files_inst.open_dataset_files()
    illumina_files_inst.split_files(compressed=runobj.compressed)
    elapsed = (time.time() - start)
    logger.debug("illumina_files demultiplex only time = %s" % str(elapsed))
def illumina_files_demultiplex_only(runobj):
    logger.debug("Start Demultiplex Illumina files by index/run_key/lane")

    start = time.time()
    illumina_files_inst = IlluminaFiles(runobj)
    illumina_files_inst.open_dataset_files()
    illumina_files_inst.split_files(compressed = runobj.compressed)
    elapsed = (time.time() - start)
    logger.debug("illumina_files demultiplex only time = %s" % str(elapsed))
Example No. 19
 def get_ref_db(self, dna_region):
     ref_db = ''
     if dna_region.upper() == 'ITS':
         logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
         ref_db = self.its_refdb
     else:
         logger.debug("using standard refdb: " + self.refdb)
         ref_db = self.refdb
     return ref_db       
Example No. 20
 def get_ref_db(self, dna_region):
     ref_db = ''
     if dna_region.upper() == 'ITS':
         ref_db = C.chimera_checking_its_refdb
         logger.debug("got an ITS dna region so using refdb: " + ref_db)
     else:
         ref_db = C.chimera_checking_refdb
         if self.utils.is_local():
             ref_db = C.chimera_checking_refdb_local
         logger.debug("using standard refdb: " + ref_db)
     return ref_db
Example No. 21
 def is_vamps(self):
     curr_hostname = os.uname()[1]
     logger.debug("curr_hostname: %s" % curr_hostname)
     vamps_comps = [
         'bpcweb8', 'bpcweb7', 'bpcweb7.bpcservers.private',
         'bpcweb8.bpcservers.private', 'vampsdev', 'vampsdb'
     ]
     if curr_hostname in vamps_comps:
         return True
     else:
         return False
def status(runobj):
    f = open(runobj.run_status_file_name)
    lines = f.readlines()
    f.close()

    logger.debug("=" * 40)
    logger.debug("STATUS LOG: ")
    for line in lines:
        line = line.strip()
        logger.debug("line in run_status_file: ")
        logger.debug(line)
    logger.debug("=" * 40 + "\n")
Example No. 23
def status(runobj):
    f = open(runobj.run_status_file_name)
    lines = f.readlines()
    f.close()

    logger.debug("=" * 40)
    logger.debug("STATUS LOG: ")
    for line in lines:
        line = line.strip()
        logger.debug("line in run_status_file: ")
        logger.debug(line)
    logger.debug("=" * 40 + "\n")
Example No. 24
def trim_fuzzy_distal(anchors_list, seq, trim_type, start, end):
    """Doc string here.."""
    max_distance = 3
    best_distance = max_distance + 1
    found_fuzzy = 0
    fuzzy_match = ""
    for anchor in anchors_list:
        anchor_length = len(anchor)
        for pos in range(start, end):

            seq_window = seq[pos:pos + anchor_length]  # window of the same length as the anchor

            dist = 0

            #dist1 = abs( Levenshtein.ratio( anchor,     seq_window ) )
            # dist2 = abs( Levenshtein.ratio( seq_window, anchor )     )
            dist1 = abs(levenshtein(anchor, seq_window))
            dist2 = abs(levenshtein(seq_window, anchor))
            if dist1 >= dist2: dist = dist1
            else: dist = dist2

            if (dist <= max_distance) and (dist < best_distance) and (
                    seq_window[:2] == anchor[:2]):
                if seq_window[-3:] != anchor[-3:]:

                    # check for deletion
                    if (seq_window[-4:][:3] == anchor[-3:]):
                        seq_window.strip()
                        logger.debug("Fuzzy with deletion " + seq_window)
                    # check for insertion
                    elif (seq_window[-3:] == anchor[-4:][:3]):
                        seq_window = seq_window + anchor[-1:]
                        logger.debug("fuzzy with insertion " + seq_window)

                # Found a fuzzy match within tolerances, so store it
                found_fuzzy = 1
                best_distance = dist
                best_position = pos
                fuzzy_match = seq_window
                if dist == 0:
                    found_exact = 1
                    break
    fuzz_right = ''
    if found_fuzzy:
        fuzz_right = seq
        if (trim_type == 'internal'):
            seq = seq[:best_position + len(fuzzy_match)]
        else:
            seq = seq[:best_position]

        fuzz_right = fuzz_right[len(seq):]
    return fuzz_right, best_distance, seq, fuzzy_match
def file_to_db_upload_seq(my_file_to_db_upload, filename, sequences):
    #     for filename in filenames:
    insert_seq_time_start = time.time()

    try:
        logger.debug("\n----------------\nfilename = %s" % filename)
        my_file_to_db_upload.seq.insert_seq(sequences)
        insert_seq_time = (time.time() - insert_seq_time_start)
        logger.debug("insert_seq() took %s sec to finish" % insert_seq_time)
    except:  # catch everything
        logger.error("\r[pipelineprocessor] Unexpected:")  # handle unexpected exceptions
        logger.error(sys.exc_info()[0])  # info about curr exception (type,value,traceback)
        raise  # re-throw caught exception
def trim_fuzzy_distal(anchors_list, seq, trim_type, start, end):
    """Doc string here.."""
    max_distance = 3
    best_distance = max_distance + 1
    found_fuzzy = 0
    fuzzy_match = ""
    for anchor in anchors_list:
        anchor_length = len(anchor)
        for pos in range(start,end):

            seq_window = seq[pos:pos + anchor_length]  # window of the same length as the anchor

            dist = 0

            #dist1 = abs( Levenshtein.ratio( anchor,     seq_window ) )
            #dist2 = abs( Levenshtein.ratio( seq_window, anchor )     )
            dist1 = abs( levenshtein( anchor,     seq_window ) )
            dist2 = abs( levenshtein( seq_window, anchor )     )
            if dist1 >= dist2:  dist = dist1
            else:               dist = dist2

            if (dist <= max_distance) and (dist < best_distance) and (seq_window[:2] == anchor[:2]):
                if seq_window[-3:] != anchor[-3:]:

                    # check for deletion
                    if(seq_window[-4:][:3] == anchor[-3:]):
                        seq_window.strip()
                        logger.debug( "Fuzzy with deletion " + seq_window)
                    # check for insertion
                    elif(seq_window[-3:] == anchor[-4:][:3]):
                        seq_window = seq_window + anchor[-1:]
                        logger.debug("fuzzy with insertion " + seq_window)

                # Found a fuzzy match within tolerances, so store it
                found_fuzzy = 1
                best_distance = dist
                best_position = pos
                fuzzy_match = seq_window
                if dist == 0:
                    found_exact = 1
                    break
    fuzz_right = ''
    if found_fuzzy:
        fuzz_right = seq
        if (trim_type == 'internal'):
            seq = seq[:best_position + len(fuzzy_match)]
        else:
            seq = seq[:best_position]

        fuzz_right = fuzz_right[len(seq):]
    return fuzz_right, best_distance, seq, fuzzy_match
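
Both trim_fuzzy_distal() variants above call a levenshtein() helper that is not shown in these examples. A minimal sketch of a standard edit-distance function that would satisfy that call (a plain dynamic-programming version, not necessarily the pipeline's own implementation):

def levenshtein(a, b):
    """Classic edit distance between strings a and b (insert/delete/substitute, cost 1 each)."""
    if len(a) < len(b):
        a, b = b, a
    previous = range(len(b) + 1)
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            current.append(min(previous[j] + 1,                 # deletion
                               current[j - 1] + 1,              # insertion
                               previous[j - 1] + (ca != cb)))   # substitution
        previous = current
    return previous[-1]

print(levenshtein("GATGA", "GATCA"))   # one substitution -> 1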
Example No. 27
 def is_local(self):
     curr_hostname = os.uname()[1]
     logger.debug("curr_hostname: %s" % curr_hostname)
     dev_comps = [
         "ashipunova.mbl.edu", "as-macbook.home", "as-macbook.local",
         "Ashipunova.local", "Annas-MacBook-new.local",
         "Annas-MacBook.local", "Anna's MacBook Pro",
         "annasmacbooknew.mbl.edu mblad.mbl.edu printers.mbl.edu jbpc.mbl.edu jbpc-np.mbl.edu",
         "AnnasMacBook.local"
     ]
     if curr_hostname in dev_comps:
         return True
     else:
         return False
    def get_gasta_result(self, filename):
        gast_file_name = self.gast_filename(filename)
        try:
            with open(gast_file_name) as fd:
                gast_dict = dict([(l.split("\t")[0], l.split("\t")[1:]) for l in fd])    
            return gast_dict
        except IOError, e:
#            print dir(e)
#['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__getitem__', '__getslice__', '__hash__', '__init__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', 'args', 'errno', 'filename', 'message', 'strerror']
#            print "errno = %s" % e.errno
            logger.debug("errno = %s" % e.errno)
            if e.errno == 2:
                # suppress "No such file or directory" error
                pass            
Example No. 29
def file_to_db_upload_seq(my_file_to_db_upload, filename, sequences):
    #     for filename in filenames:
    insert_seq_time_start = time.time()

    try:
        logger.debug("\n----------------\nfilename = %s" % filename)
        my_file_to_db_upload.seq.insert_seq(sequences)
        insert_seq_time = (time.time() - insert_seq_time_start)
        logger.debug("insert_seq() took %s sec to finish" % insert_seq_time)
    except:  # catch everything
        logger.error("\r[pipelineprocessor] Unexpected:"
                     )  # handle unexpected exceptions
        logger.error(sys.exc_info()
                     [0])  # info about curr exception (type,value,traceback)
        raise  # re-throw caught exception
Example No. 30
    def create_chimera_cmd(self, ref_db):
        """
        /usr/local/bin/vsearch
        -uchime_denovo
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg
        -uchimeout
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt
        -chimeras
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt.chimeric.fa
        -notrunclabels
        ---
        /usr/local/bin/vsearch
        -uchime_ref
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg
        -uchimeout
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db
        -chimeras
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db.chimeric.fa
        -notrunclabels
        -strand
        plus
        -db
        /groups/g454/blastdbs/rRNA16S.gold.fasta

        """
        command_line = []

        ref_or_novo_options = {
            self.denovo_suffix: "-uchime_denovo",
            self.ref_suffix: "-uchime_ref"
        }
        for suff, opt in ref_or_novo_options.items():
            input_file_name = self.indir + "/$filename_base" + self.chg_suffix
            output_file_name = self.outdir + "/$filename_base" + self.chimeras_suffix + suff

            ref_add = ""
            if (opt == "-uchime_ref"):
                ref_add = "-strand plus -db %s" % ref_db

            uchime_cmd = """%s %s %s -uchimeout %s -chimeras %s%s -notrunclabels %s
            """ % (self.usearch_cmd, opt, input_file_name, output_file_name,
                   output_file_name, self.chimeric_suffix, ref_add)
            logger.debug("UUU = uchime_cmd = %s" % uchime_cmd)
            logger.debug("+++")
            command_line.append(uchime_cmd)

        return command_line
 def insert_taxonomy(self, fasta, gast_dict):
     if gast_dict:
         (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts, max_pcts, na_pcts, refhvr_ids) = gast_dict[fasta.id]
         "if we already had this taxonomy in this run, just skip it"
         if taxonomy in self.tax_id_dict:
             next
         else:
             tax_id = self.get_id("taxonomy", taxonomy)
             if tax_id:
                 self.tax_id_dict[taxonomy] = tax_id
             else:
                 my_sql = """INSERT IGNORE INTO taxonomy (taxonomy) VALUES ('%s')""" % (taxonomy.rstrip())
                 tax_id = self.my_conn.execute_no_fetch(my_sql)
                 self.tax_id_dict[taxonomy] = tax_id
             return tax_id
     else:
         print "ERROR: can't read gast files! No taxonomy information will be processed."
         logger.debug("ERROR: can't read gast files! No taxonomy information will be processed.")
def process(run, steps):
    # create output directory:
    requested_steps = steps.split(",")            
    
    if not os.path.exists(run.output_dir):
        logger.debug("Creating output directory: "+run.output_dir)
        os.makedirs(run.output_dir)      
                    
    # loop through official list...this way we execute the
    # users requested steps in the correct order                
    for step in requested_steps:
        if step not in existing_steps:
            print "Invalid processing step: " + step
            sys.exit()
        else:
            # call the method in here
            step_method = globals()[step]
            step_method(run)
Example No. 33
    def check_if_array_job_is_done(self, job_name):
        cluster_done = False
        check_qstat_cmd_line = "qstat -r | grep %s | wc -l" % job_name
        logger.debug("check_qstat_cmd_line = %s" % check_qstat_cmd_line)
        try:
            p = subprocess.Popen(check_qstat_cmd_line, stdout=subprocess.PIPE, shell=True)
            (output, err) = p.communicate()
            num_proc = int(output)
            logger.debug("qstat is running %s '%s' processes" % (num_proc, job_name))
    #         pprint(p)

            if (num_proc == 0):
                cluster_done = True
    #         print("cluster_done from check_if_cluster_is_done = %s" % cluster_done)
        except:
            logger.error("%s can be done only on a cluster." % job_name)
            raise
        return cluster_done
def trim(runobj):
    # def is in utils.py
    # open_zipped_directory(runobj.run_date, runobj.output_dir)
    # (re) create the trim status file
    runobj.trim_status_file_h = open(runobj.trim_status_file_name, "w")

    # do the trim work
    mytrim = TrimRun(runobj)

    # pass True to write out the straight fasta file of all trimmed non-deleted seqs
    # Remember: this is before chimera checking
    if runobj.platform == "illumina":
        trim_codes = mytrim.trimrun_illumina(True)
    elif runobj.platform == "454":
        trim_codes = mytrim.trimrun_454(True)
    elif runobj.platform == "ion-torrent":
        trim_codes = mytrim.trimrun_ion_torrent(True)
    else:
        trim_codes = ["ERROR", "No Platform Found"]

    trim_results_dict = {}
    if trim_codes[0] == "SUCCESS":
        # setup to write the status
        new_lane_keys = trim_codes[2]
        trim_results_dict["status"] = "success"
        trim_results_dict["new_lane_keys"] = new_lane_keys
        logger.debug("Trimming finished successfully")
        # write the data files
        mytrim.write_data_files(new_lane_keys)
        runobj.trim_status_file_h.write(json.dumps(trim_results_dict))
        runobj.trim_status_file_h.close()
        runobj.run_status_file_h.write(json.dumps(trim_results_dict) + "\n")
        runobj.run_status_file_h.close()
    else:
        logger.debug("Trimming finished ERROR")
        trim_results_dict["status"] = "error"
        trim_results_dict["code1"] = trim_codes[1]
        trim_results_dict["code2"] = trim_codes[2]
        runobj.trim_status_file_h.write(json.dumps(trim_results_dict))
        runobj.trim_status_file_h.close()
        runobj.run_status_file_h.write(json.dumps(trim_results_dict) + "\n")
        runobj.run_status_file_h.close()
        sys.exit("Trim Error")
def process(runobj, steps):
    requested_steps = steps.split(",")
    if 'clean' in requested_steps and len(requested_steps) > 1:
        sys.exit("The clean step cannot be combined with other steps - Exiting")

    # Open run STATUS File here.
    # open in append mode because we may start the run in the middle
    # say at the gast stage and don't want to overwrite.
    # if we re-run trimming we'll get two trim status reports
    runobj.run_status_file_h = open(runobj.run_status_file_name, "a")

    # loop through official list...this way we execute the
    # users requested steps in the correct order

    for step in C.existing_steps:
        if step in requested_steps:
            # call the method in here
            logger.debug('RUN: %s' % step)
            step_method = globals()[step]
            step_method(runobj)
def trim_anchor_helper(anchor_name, expanded_anchor_sequences, freedom, length, start, sequence):
    exact = ''
    exactTrimmedOff = ''

    logger.debug( 'looking for anchor: ' + anchor_name + " start: " + str(start) + " length: " + str(length))
    max_divergence  = C.max_divergence
    logger.debug('anchor_list: ' + str(expanded_anchor_sequences))
    list_of_tuples = anchortrim.generate_tuples(start, freedom, length, list_of_tuples = [], reversed_read=False)
    logger.debug('anchor tuples: ' + str(list_of_tuples))
    anchor, location = anchortrim.find_best_distance(sequence, expanded_anchor_sequences, max_divergence, list_of_tuples)

    if anchor and location:
        logger.debug( 'anchor: ' + anchor + ' loc tuple: ' + str(location))
        trimmed_sequence = sequence[:location[1]] # same thing here for the reversed == False
        exact = anchor
        exactTrimmedOff = sequence[location[1]:]
    else:
        logger.debug( 'no anchor location found' )
        trimmed_sequence = sequence
    return exact, exactTrimmedOff, trimmed_sequence
def file_to_db_upload_all_but_seq(my_file_to_db_upload, filename, no_run_info_list, full_upload):
    total_time = 0

    try:
        my_file_to_db_upload.get_gast_result(os.path.basename(filename))

        filename_base_no_suff = get_filename_base_no_suff(filename)

        run_info_ill_id = my_file_to_db_upload.get_run_info_ill_id(filename_base_no_suff)
        if run_info_ill_id:
            my_file_to_db_upload.collect_project_ids(run_info_ill_id)
            seq_in_file = len(my_file_to_db_upload.seq.fasta_dict)
            my_file_to_db_upload.put_seq_statistics_in_file(filename, seq_in_file)
            total_time += seq_in_file

            start_fasta_next = time.time()

            start_insert_pdr_info_time = 0
            start_insert_pdr_info_time = time.time()

            my_file_to_db_upload.insert_pdr_info(run_info_ill_id)
            insert_pdr_info_time = (time.time() - start_insert_pdr_info_time)

            start_insert_taxonomy_time = 0
            start_insert_taxonomy_time = time.time()
            my_file_to_db_upload.insert_taxonomy()
            insert_taxonomy_time = (time.time() - start_insert_taxonomy_time)

            insert_sequence_uniq_info_time = 0
            start_insert_sequence_uniq_info_time = time.time()
            my_file_to_db_upload.insert_sequence_uniq_info()
            insert_sequence_uniq_info_time = (time.time() - start_insert_sequence_uniq_info_time)

            logger.debug("start_fasta_loop took %s sec to finish" % (time.time() - start_fasta_next))
            logger.debug("insert_pdf_info_query_time took %s sec to finish" % insert_pdr_info_time)
            logger.debug("start_insert_taxonomy_upload_time took %s sec to finish" % insert_taxonomy_time)
            logger.debug("insert_sequence_uniq_info_time took %s sec to finish" % insert_sequence_uniq_info_time)

            return total_time
        else:
            utils = PipelneUtils()

            no_run_info_list.append(filename_base_no_suff)
            utils.print_both(
                "ERROR: There is no run info for %s yet, please check if it's in the csv and uploaded to the db" % filename)
            return 0

    except:  # catch everything
        logger.error("\r[pipelineprocessor] Unexpected:")  # handle unexpected exceptions
        logger.error(sys.exc_info()[0])  # info about curr exception (type,value,traceback)
        raise  # re-throw caught exception
Example No. 38
 def update_sequence_uniq_info_ill(self, fasta, gast_dict):
     if gast_dict:
         (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts, max_pcts, na_pcts, refhvr_ids) = gast_dict[fasta.id]
         seq_upper = fasta.seq.upper()
         sequence_ill_id = self.seq_id_dict[seq_upper]
         if taxonomy in self.tax_id_dict:
             try:
                 taxonomy_id = self.tax_id_dict[taxonomy] 
             except Exception, e:
                 logger.debug("Error = %s" % e)
                 raise
                   
         my_sql = """INSERT IGNORE INTO sequence_uniq_info_ill (sequence_ill_id, taxonomy_id, gast_distance, refssu_count, rank_id, refhvr_ids) VALUES
                (
                 %s,
                 %s,
                 '%s',
                 '%s',
                 (SELECT rank_id FROM rank WHERE rank = '%s'),
                 '%s'                
                )
                ON DUPLICATE KEY UPDATE
                    updated = (CASE WHEN taxonomy_id <> %s THEN NOW() ELSE updated END),
                    taxonomy_id = %s,
                    gast_distance = '%s',
                    refssu_count = '%s',
                    rank_id = (SELECT rank_id FROM rank WHERE rank = '%s'),
                    refhvr_ids = '%s'
                """ % (sequence_ill_id, taxonomy_id, distance, refssu_count, rank, refhvr_ids.rstrip(), taxonomy_id, taxonomy_id, distance, refssu_count, rank, refhvr_ids.rstrip())
                 
         # my_sql = """UPDATE IGNORE sequence_uniq_info_ill
         #             SET updated = (CASE WHEN taxonomy_id <> %s THEN NOW() ELSE updated END),
         #                 taxonomy_id = %s,
         #                 gast_distance = '%s',
         #                 refssu_count = '%s',
         #                 rank_id = (SELECT rank_id FROM rank WHERE rank = '%s'),
         #                 refhvr_ids = '%s'
         #             WHERE sequence_ill_id = %s
         #          """ % (taxonomy_id, taxonomy_id, distance, refssu_count, rank, refhvr_ids.rstrip(), sequence_ill_id)
         res_id = self.my_conn.execute_no_fetch(my_sql)
         return res_id
Example No. 39
def process(runobj, steps):
    requested_steps = steps.split(",")
    if 'clean' in requested_steps and len(requested_steps) > 1:
        sys.exit(
            "The clean step cannot be combined with other steps - Exiting")

    # Open run STATUS File here.
    # open in append mode because we may start the run in the middle
    # say at the gast stage and don't want to overwrite.
    # if we re-run trimming we'll get two trim status reports
    runobj.run_status_file_h = open(runobj.run_status_file_name, "a")

    # loop through official list...this way we execute the
    # users requested steps in the correct order

    for step in C.existing_steps:
        if step in requested_steps:
            # call the method in here
            logger.debug('RUN: %s' % step)
            step_method = globals()[step]
            step_method(runobj)
Example No. 40
    def chimera_checking(self):
        chimera_region_found = False

        file_list = self.dirs.get_all_files_by_ext(self.indir, self.chg_suffix)
        logger.debug("FFF = file_list = %s" % (file_list))

        #         TODO: method
        dna_region = list(
            set([
                self.runobj.samples[idx_key].dna_region
                for idx_key in self.input_file_names
            ]))[0]
        if dna_region in C.regions_to_chimera_check:
            chimera_region_found = True
        else:
            logger.debug('region not checked: ' + dna_region)
        ref_db = self.get_ref_db(dna_region)
        command_line = self.create_chimera_cmd(ref_db)
        sh_script_file_name = self.create_job_array_script(
            "chimera_checking", command_line, self.indir, file_list)
        script_file_name_full = os.path.join(self.indir, sh_script_file_name)
        self.utils.call_sh_script(script_file_name_full, self.indir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (self.indir))
        self.dirs.chmod_all(self.indir)
        logger.debug('sh_script_file_name: ' + sh_script_file_name)
        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')
        else:
            return ("The vsearch commands were created")
Example No. 41
def env454upload_main(runobj, full_upload):
    """
    Run: pipeline dbUpload testing -c test/data/JJH_KCK_EQP_Bv6v4.ini -s env454upload -l debug
    For now upload only Illumina data to env454 from files, assuming that all run info is already on env454 (run, run_key, dataset, project, run_info_ill tables)
    Tables:
    sequence_ill
    sequence_pdr_info_ill
    taxonomy
    sequence_uniq_info_ill

    """

    whole_start     = time.time()

    my_env454upload = dbUpload(runobj)
    filenames       = my_env454upload.get_fasta_file_names()
    if not filenames:
        logger.debug("\nThere is something wrong with fasta files or their names, please check pathes, contents and suffixes in %s." % my_env454upload.fasta_dir)

#     sequences = get_sequences(my_env454upload, filenames)
    for filename in filenames:
        sequences = my_env454upload.make_seq_upper(filename)
        if full_upload:
            env454upload_seq(my_env454upload, filename, sequences)
        wrapped   = wrapper(my_env454upload.get_seq_id_dict, sequences)
        get_seq_id_dict_time = timeit.timeit(wrapped, number=1)
        logger.debug("get_seq_id_dict() took %s sec to finish" % get_seq_id_dict_time)

    total_seq = env454upload_all_but_seq(my_env454upload, filenames, full_upload)
    my_env454upload.check_seq_upload()
    logger.debug("total_seq = %s" % total_seq)
    whole_elapsed = (time.time() - whole_start)
    print "The whole upload took %s s" % whole_elapsed
Example No. 42
def vampsupload(runobj):
    """
    Upload data files to VAMPS database
    """
    # for vamps 'new_lane_keys' will be prefix
    # of the uniques and names file
    # that was just created in vamps_gast.py
    # or we can get the 'lane_keys' directly from the config_file
    # for illumina:
    # a unique idx_key is a concatenation of barcode_index and run_key
    idx_keys = get_keys(runobj)

    #     if(runobj.vamps_user_upload):
    #         idx_keys = [runobj.user+runobj.runcode]
    #     else:
    #         idx_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name,"r").read()))["new_lane_keys"]

    # NOT NEEDED HERE: Find duplicate project names
    # if vamps user uploads this has already been done and this project is
    # already in vamps_upload_info table
    # if data from a csv file (illumina and 454) this also is not needed
    # as data is checked in metadata.py

    myvamps = Vamps(runobj, idx_keys)
    # Create files
    myvamps.create_vamps_files()
    # put files in db
    result_code = myvamps.load_vamps_db()

    if result_code[:5] == 'ERROR':
        logger.error("load_vamps_db failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST_ERROR",
                                     result_code)
        sys.exit("load_vamps_db failed")
    elif runobj.vamps_user_upload:
        logger.debug("Finished loading VAMPS data. %s" % result_code)
        write_status_to_vamps_db(runobj.site, runobj.run, 'GAST_SUCCESS',
                                 'Loading VAMPS Finished')
Example No. 43
    def check_if_array_job_is_done(self, job_name):
        cluster_done = False
        check_qstat_cmd_line = "qstat -r | grep %s | wc -l" % job_name
        logger.debug("check_qstat_cmd_line = %s" % check_qstat_cmd_line)
        try:
            p = subprocess.Popen(check_qstat_cmd_line,
                                 stdout=subprocess.PIPE,
                                 shell=True)
            (output, err) = p.communicate()
            num_proc = int(output)
            logger.debug("qstat is running %s '%s' processes" %
                         (num_proc, job_name))
            #         pprint(p)

            if (num_proc == 0):
                cluster_done = True

    #         print("cluster_done from check_if_cluster_is_done = %s" % cluster_done)
        except:
            logger.error("%s can be done only on a cluster." % job_name)
            raise
        return cluster_done
Example No. 44
 def __init__(self, lane_keys, runobj):
     self.inputFileName = {}
     self.orphans = {}
     self.lane_keys = lane_keys
     self.base_dir = runobj.output_dir
     self.trim_dir = os.path.join(self.base_dir, 'analysis/trimming')
     #self.chimera_dir = os.path.join(self.base_dir, 'analysis/chimera')
     self.deleted_ids = {}
     for lane_key in lane_keys:
         self.inputFileName[lane_key] = os.path.join(self.trim_dir, lane_key + ".trimmed.fa")
         self.orphans[lane_key] = {}
         deleted_file            = os.path.join(self.trim_dir, lane_key + ".deleted.txt" )
         self.deleted_ids[lane_key] = []
         if not (os.path.exists(deleted_file) and os.path.getsize(deleted_file) > 0):
             logger.debug("No deleted sequences for lane: " + lane_key)
             continue
         del_fh = open(deleted_file,"r")
         #deleted_id_list = self.deleted_ids[lane_key] = []
         for line in del_fh.readlines():
             lst = line.strip().split()                
             #deleted_id_list.append(lst[0])
             self.deleted_ids[lane_key].append(lst[0])
Example No. 45
 def chimera_reference(self,lane_keys):
 
     chimera_region_found = False
     output = {}
     cluster_id_list = []
     for lane_key in lane_keys:
         
         dna_region  = self.run.samples[lane_key].dna_region
         if dna_region in C.regions_to_chimera_check:
             chimera_region_found = True
         else:
             logger.debug('region not checked: ' + dna_region)                    
             continue
         
         out_fileName = self.prefix[lane_key] + ".chimeras.db"      
         
         # which ref db to use?
         ref_db = ''
         if dna_region.upper() == 'ITS':
             logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
             ref_db = self.its_refdb
         else:
             logger.debug("using standard refdb: " + self.refdb)
             ref_db = self.refdb
             
         uchime_cmd = ["clusterize"]
         uchime_cmd.append(self.usearch_cmd)
         uchime_cmd.append("--uchime")
         uchime_cmd.append(self.files[lane_key]['abund'])
         uchime_cmd.append("--uchimeout")
         uchime_cmd.append(out_fileName)
         uchime_cmd.append("--db")
         uchime_cmd.append(ref_db)
         
         
         try:
             print "chimera referenc command: " + str(uchime_cmd)
             output[lane_key] = subprocess.check_output(uchime_cmd)
             #print 'outsplit',output[lane_key].split()[2]
             cluster_id_list.append(output[lane_key].split()[2])
             #print 'Have %d bytes in output' % len(output)
             #print 'ref',lane_key,output,len(output)
             if len(output[lane_key]) < 50 and len(output[lane_key]) > 40:
                 logger.debug(lane_key + " uchime ref seems to have been submitted successfully")                    
             else:
                 print >>sys.stderr, "uchime ref may be broke"
            
         except OSError, e:
             print >>sys.stderr, "Execution failed:", e 
Example No. 46
def trim(runobj):
    # def is in utils.py
    # open_zipped_directory(runobj.run_date, runobj.output_dir)
    # (re) create the trim status file
    runobj.trim_status_file_h = open(runobj.trim_status_file_name, "w")
    idx_keys = get_keys(runobj)

    # do the trim work
    mytrim = TrimRun(runobj, idx_keys)

    # pass True to write out the straight fasta file of all trimmed non-deleted seqs
    # Remember: this is before chimera checking
    # trim_codes should always be a tuple with 3 elements!
    if runobj.vamps_user_upload:
        trim_codes = mytrim.trimrun_vamps(True)
    else:
        if runobj.platform == 'illumina':
            trim_codes = mytrim.filter_illumina()
        #        trim_codes = mytrim.trim_illumina(file_list = trim_codes[2])
        elif runobj.platform == '454':
            trim_codes = mytrim.trimrun_454(True)
        elif runobj.platform == 'ion-torrent':
            trim_codes = mytrim.trimrun_ion_torrent(True)
        else:
            trim_codes = ('ERROR', 'No Platform Found', '')

    trim_results_dict = {}
    if trim_codes[0] == 'SUCCESS':
        # setup to write the status
        new_lane_keys = trim_codes[2]
        trimmed_seq_count = trim_codes[1]
        if trimmed_seq_count == 0 or trimmed_seq_count == '0':
            trim_results_dict['status'] = "ERROR"
            logger.debug("Trimming finished: ERROR: no seqs passed trim")
        else:
            trim_results_dict['status'] = "success"
            logger.debug("Trimming finished successfully")

        trim_results_dict['new_lane_keys'] = new_lane_keys
        trim_results_dict['trimmed_seq_count'] = trimmed_seq_count

        # write the data files

        mytrim.write_data_files(new_lane_keys)
        runobj.trim_status_file_h.write(json.dumps(trim_results_dict) + "\n")
        runobj.trim_status_file_h.close()
        runobj.run_status_file_h.write(json.dumps(trim_results_dict) + "\n")
        runobj.run_status_file_h.close()
    else:
        logger.debug("Trimming finished ERROR")
        trim_results_dict['status'] = "ERROR"
        trim_results_dict['code1'] = trim_codes[1]
        trim_results_dict['code2'] = trim_codes[2]
        runobj.trim_status_file_h.write(json.dumps(trim_results_dict) + "\n")
        runobj.trim_status_file_h.close()
        runobj.run_status_file_h.write(json.dumps(trim_results_dict) + "\n")
        runobj.run_status_file_h.close()
        sys.exit("Trim Error")
Example No. 47
    def write_clean_fasta_file(self):
        """
        def to write a new fasta from the original fasta file
                using the deleted file

        The deleted file contains the trimming deleted as well
        as the chimera deleted
        Then write the uniques from Meren's fastalib
        """
        sleep(2)
        for lane_key in self.lane_keys:
            logger.debug("write_clean_fasta_file working on lanekey: " +
                         lane_key)
            deleted_id_list = []
            original_trimmed_file = os.path.join(self.trim_dir,
                                                 lane_key + ".trimmed.fa")
            new_trimmed_file_name = os.path.join(self.trim_dir,
                                                 lane_key + ".newtrimmed.fa")
            new_trimmed_file = fa.FastaOutput(new_trimmed_file_name)

            # open trimmed file and read a line
            trimmedfasta = fa.SequenceSource(original_trimmed_file)
            logger.debug(
                "write_clean_fasta_file about to check trimmedfasta file")
            deleted_id_list = self.deleted_ids[lane_key]
            if len(deleted_id_list) == 0:
                continue
            while trimmedfasta.next():
                if trimmedfasta.id not in deleted_id_list:
                    new_trimmed_file.store(trimmedfasta)
            new_trimmed_file.close()

            # rename to newtrimmed => trimmed
            os.rename(
                original_trimmed_file,
                os.path.join(self.trim_dir,
                             lane_key + ".trimmed_with_chimera.fa"))
            os.rename(new_trimmed_file_name, original_trimmed_file)
def trim(runobj):
    # def is in utils.py
    #open_zipped_directory(runobj.run_date, runobj.output_dir)
    # (re) create the trim status file
    runobj.trim_status_file_h = open(runobj.trim_status_file_name, "w")
    idx_keys = get_keys(runobj)
    
    # do the trim work
    mytrim = TrimRun(runobj, idx_keys) 
    
    # pass True to write out the straight fasta file of all trimmed non-deleted seqs
    # Remember: this is before chimera checking
    # trim_codes should always be a tuple with 3 elements!
    if runobj.vamps_user_upload:
        trim_codes = mytrim.trimrun_vamps(True)
    else:
        if runobj.platform == 'illumina':
            trim_codes = mytrim.filter_illumina()
    #        trim_codes = mytrim.trim_illumina(file_list = trim_codes[2])
        elif runobj.platform == '454':
            trim_codes = mytrim.trimrun_454(True)
        elif runobj.platform == 'ion-torrent':
            trim_codes = mytrim.trimrun_ion_torrent(True)        
        else:
            trim_codes = ('ERROR','No Platform Found','')
        
    trim_results_dict = {}
    if trim_codes[0] == 'SUCCESS':
        # setup to write the status
        new_lane_keys = trim_codes[2]
        trimmed_seq_count = trim_codes[1]
        if trimmed_seq_count == 0 or trimmed_seq_count == '0':
            trim_results_dict['status'] = "ERROR"
            logger.debug("Trimming finished: ERROR: no seqs passed trim")
        else:
            trim_results_dict['status'] = "success"
            logger.debug("Trimming finished successfully")
        
        trim_results_dict['new_lane_keys'] = new_lane_keys
        trim_results_dict['trimmed_seq_count'] = trimmed_seq_count
        
        # write the data files
        
        mytrim.write_data_files(new_lane_keys)
        runobj.trim_status_file_h.write(json.dumps(trim_results_dict)+"\n")
        runobj.trim_status_file_h.close()
        runobj.run_status_file_h.write(json.dumps(trim_results_dict)+"\n")
        runobj.run_status_file_h.close()
    else:
        logger.debug("Trimming finished ERROR")
        trim_results_dict['status'] = "ERROR"
        trim_results_dict['code1'] = trim_codes[1]
        trim_results_dict['code2'] = trim_codes[2]
        runobj.trim_status_file_h.write(json.dumps(trim_results_dict)+"\n")
        runobj.trim_status_file_h.close()
        runobj.run_status_file_h.write(json.dumps(trim_results_dict)+"\n")
        runobj.run_status_file_h.close()
        sys.exit("Trim Error")
Example No. 49
 def insert_sequence_uniq_info_ill(self, fasta, gast_dict):
     if gast_dict:
         (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts, max_pcts, na_pcts, refhvr_ids) = gast_dict[fasta.id]
         seq_upper = fasta.seq.upper()
         sequence_ill_id = self.seq_id_dict[seq_upper]
         # look up the taxonomy_id; without it the INSERT below cannot be
         # built, so log and re-raise if the taxonomy is unknown
         try:
             taxonomy_id = self.tax_id_dict[taxonomy]
         except KeyError, e:
             logger.debug("Error = %s" % e)
             raise
         my_sql = """INSERT IGNORE INTO sequence_uniq_info_ill (sequence_ill_id, taxonomy_id, gast_distance, refssu_count, rank_id, refhvr_ids) VALUES
                (
                 %s,
                 %s,
                 '%s',
                 '%s',
                 (SELECT rank_id FROM rank WHERE rank = '%s'),
                 '%s'                
                )
                """ % (sequence_ill_id, taxonomy_id, distance, refssu_count, rank, refhvr_ids.rstrip())
         res_id = self.my_conn.execute_no_fetch(my_sql)
         return res_id
def trim_stop_seq(stop_seqs, seq, trim_type, start, end):

    for anchor in stop_seqs:
        anchor_length = len(anchor)
        logger.debug("trim_stop_seq: " + anchor + " " + str(start) + " " + str(end) + " " + str(len(seq)))
        for pos in range(start, end):
            seq_window = seq[pos:pos + anchor_length]

            dist = abs(Levenshtein.ratio(anchor, seq_window))
            #dist2 = abs(Levenshtein.ratio(seq_window, anchor))
            if dist == 1.0:
                # perfect match
                # do I trim off before or after anchor?
                # before:
                return anchor, seq[pos + anchor_length:], seq[:pos]
                # after (include anchor in trimmed seq):
                #return anchor, seq[pos:], seq[:pos + anchor_length]
            if dist >= C.max_divergence:
                pass
            #print(pos, seq_window, dist)

    return '', '', seq
Exemplo n.º 51
0
def trim_stop_seq(stop_seqs, seq, trim_type, start, end):

    for anchor in stop_seqs:
        anchor_length = len(anchor)
        logger.debug(anchor + " " + str(start) + " " + str(end) + " " +
                     str(len(seq)))
        for pos in range(start, end):
            seq_window = seq[pos:pos + anchor_length]

            dist = abs(Levenshtein.ratio(anchor, seq_window))
            #dist2 = abs( Levenshtein.ratio( seq_window, anchor )     )
            if dist == 1.0:
                # perfect match
                # do I trim off before or after anchor?
                #before
                return anchor, seq[pos + anchor_length:], seq[:pos]
                #after (include anchor in trimmed seq):
                #return anchor,seq[pos:],seq[:pos+anchor_length]
            if dist >= C.max_divergence:
                pass
            #print(pos, seq_window, dist)

    return '', '', seq
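
A minimal usage sketch for trim_stop_seq, assuming stop_seqs holds anchor strings and C.max_divergence is defined in the constants module (note that the trim_type argument is never used inside the function); the anchor sequence below is only illustrative:

# hypothetical call: look for the stop anchor anywhere in the read
anchor, after_anchor, kept = trim_stop_seq(['TTGTACACACCGCCC'], seq, '', 0, len(seq))
if anchor:
    seq = kept   # anchor found: keep only the part of the read before it
# if no anchor was found, anchor is '' and kept is the unmodified read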
Exemplo n.º 52
0
    def run_until_done_on_cluster(self, job_name):
        start = time.time()
        time_before = self.get_time_now()
        logger.debug("time_before = %s" % time_before)
        logger.debug("Waiting for the cluster...")
        while True:
            if self.is_local():
                time.sleep(1)
            else:
                time.sleep(120)
            cluster_done = self.check_if_array_job_is_done(job_name)
            logger.debug("cluster_done = %s" % cluster_done)
            if (cluster_done):
                break

        elapsed = (time.time() - start)
        logger.debug("Cluster is done with %s in: %s" % (job_name, elapsed))
Exemplo n.º 53
0
def illumina_chimera_after_cluster(runobj):
    mychimera = Chimera(runobj)

    mychimera.illumina_rm_size_files()
    func_start = time.time()
    start = time.time()
    mychimera.illumina_size_to_freq_in_chimer()
    elapsed = (time.time() - start)
    logger.debug("illumina_size_to_freq_in_chimer time: %s" % elapsed)

    #     start = time.time()
    #     logger.debug("Check chimeric statistics. If ref > 15% and ratio ref to de-novo > 2 use only de-novo")
    #     mychimera.check_chimeric_stats()
    #     elapsed = (time.time() - start)
    #     logger.debug("check_chimeric_stats time: %s" % elapsed)

    start = time.time()
    logger.debug("Creating nonchimeric files in %s" % mychimera.indir)
    mychimera.move_out_chimeric()
    elapsed = (time.time() - start)
    logger.debug("move_out_chimeric time: %s" % elapsed)
    logger.debug("illumina_chimera_after_cluster time = %s" % str(elapsed))
Exemplo n.º 54
0
def clean(runobj):
    """
    Removes a run from the database and output directory
    """

    answer = raw_input("\npress 'y' to delete the run '" + runobj.run_date +
                       "': ")
    if answer == 'y' or answer == 'Y':

        for (archiveDirPath, dirNames,
             file_names) in os.walk(runobj.output_dir):
            logger.debug("Removing run: %s" % runobj.run_date)
            for f in file_names:
                # join with the directory os.walk is currently in, not the
                # top-level output_dir, so files in subdirectories are found
                file_path = os.path.join(archiveDirPath, f)
                logger.debug("file_path: %s" % file_path)
                os.remove(file_path)
Exemplo n.º 55
0
    def check_projects_and_datasets(self, data):
        self.get_my_conn()
        project_dataset = {}
        projects = {}
        datasets = {}
        error = False
        warn = False
        for item in data:
            if item != 'general':
                #project_dataset[data[item]['project']+'--'+data[item]['dataset']] = 1
                datasets[data[item]['dataset']] = data[item]['project']
                projects[data[item]['project']] = 1
        for p in projects:
            #print(p)
            my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p)
            res    = self.my_conn.execute_fetch_select(my_sql)
            if res:
                logger.warning("project '"+p+"' already exists in the database - is this okay?")
                warn = True
            else:
                logger.debug("project '"+p+"' is new")

            ds_found_count = 0
            for d in datasets:
                if datasets[d] == p:

                    #print("\t%s" % (d))
                    my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d)
                    res    = self.my_conn.execute_fetch_select(my_sql)
                    if res:
                        ds_found_count += 1
                        if ds_found_count >3:
                            logger.warning("\t\tPossibly more .... - Exiting after just three")
                            break
                        logger.warning("\tdataset '"+d+"' already exists in the database - is this okay?")
                        warn=True
                    else:
                        logger.debug("\tdataset '"+d+"' is new")
            logger.debug("\tDataset Count: "+str(len(datasets)))
        return (error,warn)
Exemplo n.º 56
0
def get_keys(runobj):
    try:
        idx_keys = convert_unicode_dictionary_to_str(
            json.loads(open(runobj.trim_status_file_name,
                            "r").read()))["new_lane_keys"]
        # {"status": "success", "new_lane_keys": ["1_GATGA"]}
    except:
        # here we have no idx_keys - must create them from the run
        # if illumina they are an index_runkey_lane concatenation
        # if 454 they are lane_key
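        # e.g. a 454 lane_key looks like "1_GATGA" (as in the status-file
        # example above); an illumina key concatenates index, runkey and lane,
        # e.g. "ATCACG_NNNNACGCA_1" -- that exact layout is an assumption here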
        if runobj.vamps_user_upload:
            # logger.debug('KEYS: '+' '.join(runobj.run_keys))
            idx_keys = runobj.samples.keys()
        else:
            if runobj.platform == 'illumina':
                idx_keys = runobj.idx_keys
                ct = 0
                for h in runobj.samples:
                    logger.debug("get_keys, h:")
                    logger.debug(h)
                    #                    logger.debug(h,runobj.samples[h]) #TypeError: not all arguments converted during string formatting
                    ct += 1
            elif runobj.platform == '454':
                idx_keys = runobj.idx_keys
            elif runobj.platform == 'ion_torrent':
                idx_keys = runobj.idx_keys
            else:
                logger.debug("GAST: No keys found - Exiting")
                runobj.run_status_file_h.write(
                    "GAST: No keys found - Exiting\n")
                sys.exit()

    if isinstance(idx_keys, str):
        return idx_keys.split(',')
    elif isinstance(idx_keys, list):
        return idx_keys
    else:
        return None
Exemplo n.º 57
0
    v.convert_and_save_ini(data_object['output_dir'])

    data_object = v.validate(data_object['output_dir'])
    #general_data = v.get_general_data()

    answer = v.get_confirmation(args.steps, data_object['general'])
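    # get_confirmation is expected to return a single character:
    # 'c' to continue, 'v' to view the INI/CONFIG file, 'q' to quit
    # (inferred from the branches below)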
    #print('do2',data_object)
    if answer == 'q':
        sys.exit()
    elif answer == 'v':
        # view CONFIG file contents
        fh = open(
            os.path.join(dirs.analysis_dir,
                         data_object['general']['run'] + '.ini'))
        lines = fh.readlines()
        logger.debug("\n=== START ===\n")
        for line in lines:
            line = line.strip()
            logger.debug("line in INI: ")
            logger.debug(line)
        logger.debug("==== END ====\n")
        sys.exit()
    elif answer != 'c':
        sys.exit()
    ##############
    #
    # CREATE THE RUN OBJECT (see runconfig.py for details)
    #
    ##############
    runobj = Run(data_object, os.path.dirname(os.path.realpath(__file__)))
Exemplo n.º 58
0
def wait_for_cluster_to_finish(my_running_id_list):
    #print('My IDs',running_id_list)
    logger.debug('Max run time set to ' + str(C.cluster_max_wait) + ' seconds')
    logger.debug('These are my running qsub IDs ' + str(my_running_id_list))
    my_working_id_list = my_running_id_list

    counter = 0

    time.sleep(C.cluster_initial_check_interval)

    while my_working_id_list:

        qstat_codes = get_qstat_id_list()
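        # get_qstat_id_list is assumed to return parallel lists, roughly
        # {'id': [...], 'name': [...], 'user': [...], 'code': [...]},
        # one entry per qsub job currently visible to qstat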
        if not qstat_codes['id']:
            #print('No qstat ids')
            logger.debug(
                "id list not found: may need to increase initial_interval if you haven't seen running ids."
            )
            return (
                'SUCCESS',
                'id list not found',
                '',
            )
        if 'Eqw' in qstat_codes['code']:
            logger.debug(
                "Check cluster: may have error code(s), but they may not be mine!"
            )

        got_one = False

        #print('working ids',my_working_id_list)
        if my_working_id_list[0] in qstat_codes['id']:

            got_one = True
            name = qstat_codes['name'][qstat_codes['id'].index(
                my_working_id_list[0])]
            user = qstat_codes['user'][qstat_codes['id'].index(
                my_working_id_list[0])]
            code = qstat_codes['code'][qstat_codes['id'].index(
                my_working_id_list[0])]

            if code == 'Eqw':
                return ('FAIL', 'Found Eqw code', my_working_id_list[0])
            elif code == 'qw':
                logger.debug("id is still queued: " +
                             str(my_working_id_list[0]) + " " + str(code))
            elif code == 'r':
                logger.debug("id is still running: " +
                             str(my_working_id_list[0]) + " " + str(code))
            else:
                logger.debug('Unknown qstat code ' + str(code))
        else:
            my_working_id_list = my_working_id_list[1:]
            logger.debug('id finished ' + str(my_working_id_list))

        if not my_working_id_list:
            return ('SUCCESS', 'not my_working_id_list', '')
        #if not got_one:
        #print('IN not got one',)
        #    return ('SUCCESS','not got one','')

        time.sleep(C.cluster_check_interval)
        counter = counter + C.cluster_check_interval
        if counter >= C.cluster_max_wait:
            return ('FAIL', 'Max Time exceeded', C.cluster_max_wait)

    return ('FAIL', 'Unknown', 'Unknown')
Exemplo n.º 59
0
 def delete_file(self, filename):
     try:
         os.remove(filename)
         logger.debug("DELETE %s" % (filename))
     except OSError:
         pass
Exemplo n.º 60
0
 def print_both(self, message):
     logger.debug("print_both: ")
     print(message)
     logger.debug(message)