def main(argv):
    """
    Aggregate htseq-count output files ( *_read_count.tsv ) in a count
    directory into one raw-count csv per organism (KN99 / S288C_R64),
    written to the directory above the count directory.

    :param argv: command line argument list, passed to parseArgs
    """
    args = parseArgs(argv)
    try:
        if not os.path.isdir(args.count_directory):
            raise NotADirectoryError('ERROR: %s does not exist.' % args.count_directory)
        if not os.path.isfile(args.query_sheet):
            # bugfix: previously raised NotADirectoryError with a message that
            # pointed at count_directory instead of the query sheet
            raise FileNotFoundError('ERROR: %s does not exist.' % args.query_sheet)
    except (NotADirectoryError, FileNotFoundError) as path_error:
        # bugfix: the except previously caught only FileNotFoundError (which was
        # never raised) and printed an unfilled '%s' format string
        print('path does not exist: %s' % path_error)
    else:
        count_dirpath = args.count_directory
        query_sheet_path = args.query_sheet
        query_df = utils.readInDataframe(query_sheet_path)
        # extract count files from count_dir
        count_dir_file_list = glob.glob(
            os.path.join(count_dirpath, '*read_count.tsv'))
        # TODO: SOME ERROR CHECKING ON THE FASTQFILENAME?
        # all crypto records have genotype1 beginning with CNAG_ -- use this to
        # split the query sheet into crypto (KN99) and yeast (S288C_R64) samples.
        # sets for O(1) membership tests below
        # TODO: after metadata organism column added, update this section
        crypto_sample_set = set(
            query_df[query_df.genotype1.str.startswith('CNAG')].fastqFileName)
        s288c_r64_sample_set = set(
            query_df[~query_df.genotype1.str.startswith('CNAG')].fastqFileName)
        # split list of count files based on membership in the sample sets above
        count_files_by_organism_dict = {
            'KN99': [
                x for x in count_dir_file_list
                if os.path.basename(x.replace('_read_count.tsv', '.fastq.gz')) in crypto_sample_set
            ],
            'S288C_R64': [
                x for x in count_dir_file_list
                if os.path.basename(x.replace('_read_count.tsv', '.fastq.gz')) in s288c_r64_sample_set
            ]
        }
        # create and write out a count sheet for each organism with >= 1 count file
        for organism, count_file_list in count_files_by_organism_dict.items():
            if len(count_file_list) > 0:
                od = OrganismData(organism=organism,
                                  config_file=args.config_file,
                                  interactive=args.interactive)
                count_df = od.createCountSheet(count_file_list)
                # write next to (one level above) the count directory
                output_path = os.path.join(
                    utils.dirPath(utils.dirPath(count_file_list[0])),
                    '%s_raw_count.csv' % organism)
                print('writing count file to %s' % output_path)
                count_df.to_csv(output_path, index=False)
def createOrganismDataLogger(self):
    """
    create logger for OrganismData
    :raises: NotADirectoryError if logger_directory_path does not exist
    """
    # the log file's parent directory must already exist
    log_directory = utils.dirPath(self.log_file_path)
    if not os.path.isdir(log_directory):
        raise NotADirectoryError('LogDirectoryDoesNotExist')
    self.logger = utils.createStandardObjectChildLogger(self, __name__)
def subdirectoryReport(self, subdirectory_name, subdir_filepath_list, key_column_only_report=False):
    """
    Append a column-name / row-entry accuracy report for one database
    subdirectory to self.accuracy_check_output_file.

    :param subdirectory_name: name of the subdirectory; key into self.specification_dict
    :param subdir_filepath_list: list of sheet filepaths in that subdirectory to check
    :param key_column_only_report: if True, report row inconsistencies only when the
        inconsistent column overlaps the subdirectory's key columns
    """
    print('Checking %s column name formatting and entries' % subdirectory_name)
    specs_website = 'https://github.com/BrentLab/database_files/wiki'
    # append mode: a single report file accumulates sections across subdirectories
    with open(self.accuracy_check_output_file, 'a') as subdirectory_report:
        subdirectory_report.write(
            'Checking %s for adherence to specifications found at: %s\n' %
            (subdirectory_name, specs_website))
        subdirectory_report.write(
            'Last update (likely git pull) to directory: %s\n\n' %
            self.last_git_change)
        for subdirectory_filepath in subdir_filepath_list:
            self.logger.debug('Checking %s:\n' % subdirectory_filepath)
            # extract dictionaries of inconsistencies in column names and rows
            col_inconsistencies_dict, row_inconsistencies_dict = self.checkColumns(
                self.specification_dict[subdirectory_name], subdirectory_filepath)
            # check the format of the filename
            if not self.checkFileName(subdirectory_name, subdirectory_filepath):
                subdirectory_report.write(
                    '\tThe filename %s does not adhere to the specifications. Please correct.\n\n'
                    % os.path.basename(subdirectory_filepath))
            # check column headings; the first entry doubles as a sentinel header line
            # (see the endswith() checks below that decide whether to write anything)
            lines_to_write = ['In sheet %s:\n\tThe items below are column headings in a given sheet that do not match '
                             'the specifications (key and non-key, this should be fixed when found).\n' % subdirectory_filepath]
            for spec_column, sheet_column in col_inconsistencies_dict.items():
                lines_to_write.append(
                    '\tThe specification is: %s, the sheet column is: %s\n' %
                    (spec_column, sheet_column))
            lines_to_write.append(
                '\n\tThe items below are numbered by row (eg 1: inductionDelay means a problem in row 1 of inductionDelay). If shortReport, only key columns are checked:\n')
            for row_index, column_heading in row_inconsistencies_dict.items():
                # if key_column_only_report == True, only write out if the
                # column_heading overlaps the subdirectory's key column set
                subdir_key_set = set(
                    self.key_column_dict[utils.pathBaseName(
                        utils.dirPath(subdirectory_filepath))])
                current_column_heading_set = set(column_heading)
                # determine if column heading is in key column set:
                # if the set difference shrank, at least one key column is involved
                key_set_diff_length = len(subdir_key_set - current_column_heading_set)
                if not key_column_only_report or (len(subdir_key_set) != key_set_diff_length):
                    lines_to_write.append(
                        '\tRow %s has an inconsistency in column %s\n' %
                        (row_index, column_heading))
            # if no rows were found to have inconsistencies, the last line is still the
            # row-section header -- remove it from lines_to_write
            if lines_to_write[-1].endswith('only key columns are checked:\n'):
                lines_to_write.pop(-1)
            # if no column headings are found to be inconsistent (last line is still the
            # column-section header), don't write at all. otherwise, write out the lines
            if not lines_to_write[-1].endswith('this should be fixed when found).\n'):
                lines_to_write.append('\n\n\n\n')
                subdirectory_report.write(''.join(lines_to_write))
def parseGeneCount(self, htseq_counts_path):
    """
    NOTE: SPECIFICALLY SET UP FOR CRYPTO
    Parse an htseq-count output file: sum protein-coding gene counts (CKF44
    lines) and collect the htseq special-counter lines (eg __no_feature).

    :param htseq_counts_path: a path to a _read_count.tsv file (htseq-counts output)
    :returns: a dictionary with keys FEATURE_ALIGN_NOT_UNIQUE, TOO_LOW_AQUAL,
        AMBIGUOUS_FEATURE, NO_FEATURE, NOT_ALIGNED_TOTAL, PROTEIN_CODING_COUNTED,
        NAT_LOG2CPM, G418_LOG2CPM and (conditionally) GENOTYPE*_LOG2CPM,
        OVEREXPRESSION_FOW
    """
    sample_name = utils.pathBaseName(htseq_counts_path).replace('_read_count', '')
    try:
        # genotype/perturbation are [primary, secondary]; secondary filled below
        genotype = [self.extractInfoFromQuerySheet(sample_name, 'genotype1'), None]
        perturbation = [self.extractInfoFromQuerySheet(sample_name, 'perturbation1'), None]
    except KeyError:
        self.logger.info('Not in query sheet: %s' % htseq_counts_path)
        sys.exit('Count file passed to one of the quality assessment objects was not in the query sheet. These * should be * filtered out in the qual_assess_1 script')
    try:
        # extract genotype2/perturbation2, or leave them as None
        genotype[1] = self.extractInfoFromQuerySheet(sample_name, 'genotype2')
        perturbation[1] = self.extractInfoFromQuerySheet(sample_name, 'perturbation2')
    except KeyError:
        self.logger.debug(
            "%s has no genotype2 and/or perturbation2 -- may need to check script if this is expected" % sample_name)
    # bugfix: the remainder of this method previously lived in the `else` of the
    # try above, so any sample without genotype2 skipped parsing entirely and
    # the method returned None
    library_metadata_dict = {}
    # TODO: error checking on keys
    crypto_protein_coding_count = 0
    # read bottom-up: htseq's special counters (__no_feature, __ambiguous, ...)
    # are the last lines of the file.  `with` guarantees the handle is closed
    # even if parsing raises (original leaked it on error paths).
    with open(htseq_counts_path, 'r') as htseq_file:
        for line in reversed(htseq_file.readlines()):
            line_strip_split = line.strip().split('\t')
            if line.startswith('CKF44'):
                # second column is the gene count; accumulate protein coding total
                crypto_protein_coding_count += int(line_strip_split[1])
            if not (line.startswith('CNAG') or line.startswith('CKF44')):
                # eg __alignment_not_unique --> ALIGNMENT_NOT_UNIQUE (drop the __)
                htseq_count_metadata_category = line_strip_split[0][2:].upper()
                # setdefault: first (ie bottom-most) occurrence wins
                library_metadata_dict.setdefault(
                    htseq_count_metadata_category, int(line_strip_split[1]))
    # error check gene count (simplified from raise-then-catch-own-ValueError)
    if crypto_protein_coding_count == 0:
        # bugfix: previously formatted the open file object, not the path
        self.logger.info('no lines start with CKF44 -- check organism: %s' % htseq_counts_path)
        print('No lines starting with CKF44 have gene counts')
    # rename some key/value pairs
    library_metadata_dict['NOT_ALIGNED_TOTAL'] = library_metadata_dict.pop('NOT_ALIGNED')
    library_metadata_dict['FEATURE_ALIGN_NOT_UNIQUE'] = library_metadata_dict.pop('ALIGNMENT_NOT_UNIQUE')
    library_metadata_dict['AMBIGUOUS_FEATURE'] = library_metadata_dict.pop('AMBIGUOUS')
    # add PROTEIN_CODING_COUNTED
    library_metadata_dict['PROTEIN_CODING_COUNTED'] = crypto_protein_coding_count
    # add log2cpm data -- note, this will look in the run_####_samples directory of subdir count
    log2cpm_path = os.path.join(
        utils.dirPath(utils.dirPath(htseq_counts_path)),
        '%s_log2_cpm.csv' % self.organism)
    if not os.path.isfile(log2cpm_path):
        # best-effort warning (original raised and immediately caught the error)
        msg = ' Output of log2cpm.R, which requires output of %s_raw_counts.py, ' \
              'must be in run_####_samples directory containing subdir count. ' \
              'This doesn\'t exist in %s' % (self.organism, sample_name)
        print(msg)
        self.logger.critical(msg)
    library_metadata_dict['NAT_LOG2CPM'] = self.extractLog2cpm(
        'CNAG_NAT', sample_name, log2cpm_path)
    library_metadata_dict['G418_LOG2CPM'] = self.extractLog2cpm(
        'CNAG_G418', sample_name, log2cpm_path)
    print("...extracting genotype log2cpm -- TESTING TESTING TESTING")
    # CNAG_00000 denotes wildtype genotype1 -- presumably no perturbed-gene
    # log2cpm to extract in that case (TODO confirm)
    if genotype[0] != 'CNAG_00000':
        library_metadata_dict['GENOTYPE1_LOG2CPM'] = self.extractLog2cpm(
            genotype[0].replace("CNAG", "CKF44"), sample_name, log2cpm_path)
        if genotype[1] is not None:
            library_metadata_dict['GENOTYPE2_LOG2CPM'] = self.extractLog2cpm(
                genotype[1].replace("CNAG", "CKF44"), sample_name, log2cpm_path)
    if perturbation[0] == "over":
        sample_medium = self.extractInfoFromQuerySheet(sample_name, 'treatment')
        sample_temperature = self.extractInfoFromQuerySheet(sample_name, 'temperature')
        sample_atmosphere = self.extractInfoFromQuerySheet(sample_name, 'atmosphere')
        sample_timepoint = self.extractInfoFromQuerySheet(sample_name, 'timePoint')
        # bugfix: genotype is a list -- original called .replace() on the list
        # itself, which raised AttributeError in this branch
        perturbed_gene = genotype[0].replace('_over', '').replace('CNAG', 'CKF44')
        # THIS NEEDS TO BE UPDATED WITH NEW MEDIAN_LOG2CPM BY WILDTYPE REPLICATE GROUPS WHEN TREATMENT COLUMNS ARE STABLE AGAIN
        library_metadata_dict['OVEREXPRESSION_FOW'] = 0  # self.foldOverWildtype(perturbed_gene, sample_name, log2cpm_path, [sample_medium, sample_temperature, sample_atmosphere], sample_timepoint)
    return library_metadata_dict