def check_args(args):
    '''
    Check the input arguments.
    '''

    # tracks whether every argument passed validation
    is_valid = True

    def report_error(message):
        # print the error through the application message channel and
        # remember that validation failed
        nonlocal is_valid
        xlib.Message.print('error', message)
        is_valid = False

    # "ngshelper_database" is mandatory
    if args.ngshelper_database is None:
        report_error('*** The NGShelper database is not indicated in the input arguments.')

    # "alignment_file" is mandatory and must exist on disk
    if args.alignment_file is None:
        report_error('*** The alignment file is not indicated in the input arguments.')
    elif not os.path.isfile(args.alignment_file):
        report_error(f'*** The file {args.alignment_file} does not exist.')

    # "verbose" falls back to its default when missing, otherwise it must be a known code
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        report_error(f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace" falls back to its default when missing, otherwise it must be a known code
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        report_error(f'*** trace has to be {xlib.get_trace_code_list_text()}.')
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # if there are errors, exit with exception
    if not is_valid:
        raise xlib.ProgramException('', 'P001')
def check_infrastructure_software():
    '''
    Check if the infrastructure software is setup.

    Runs "blastx -h" with all output discarded, so only the process return
    code tells whether the program is reachable on the PATH.

    Raises xlib.ProgramException('', 'I001') when blastx is not found.
    '''

    # initialize the control variable
    OK = True

    # check blastx
    if sys.platform.startswith('linux') or sys.platform.startswith('darwin'):
        command = 'blastx -h >/dev/null 2>&1'
    elif sys.platform.startswith('win32') or sys.platform.startswith('cygwin'):
        # FIX: "nul" is the Windows discard device; the original "1>null"
        # created a literal file named "null" in the working directory
        command = 'blastx.exe -h 1>nul 2>&1'
    else:
        # FIX: on any other platform the original code crashed with a
        # NameError because "command" was never assigned; fall back to the
        # POSIX form so the check still runs
        command = 'blastx -h >/dev/null 2>&1'
    rc = subprocess.call(command, shell=True)
    if rc != 0:
        OK = False
        xlib.Message.print('error', 'blastx is not found.')

    # if there is software not found, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'I001')
def form_update_region_zone():
    '''
    Update the current region and zone names in the NGScloud config file
    corresponding to the environment.
    '''

    # initialize the control variable
    OK = True

    # print the header
    clib.clear_screen()
    clib.print_headers_with_environment('Configuration - Update region and zone')

    # input new current region and zone
    # FIX: the original passed region_name/zone_name to the input helpers
    # before any assignment, which raises UnboundLocalError; they are
    # initialized to None here so the helpers use their own defaults.
    # NOTE(review): confirm cinputs.input_region_name/input_zone_name accept
    # None as the current value.
    region_name = None
    zone_name = None
    print(xlib.get_separator())
    region_name = cinputs.input_region_name(region_name, help=True)
    zone_name = cinputs.input_zone_name(region_name, zone_name, help=True)

    # get the NGScloud config file
    ngscloud_config_file = xconfiguration.get_ngscloud_config_file()

    # confirm the region and zone update in the NGScloud config file
    print(xlib.get_separator())
    OK = clib.confirm_action('The file {0} is going to be update with the new region and zone.'.format(ngscloud_config_file))

    # save the options dictionary in the NGScloud config file
    if OK:
        print(xlib.get_separator())
        print('The file {0} is being update with the new region and zone ...'.format(ngscloud_config_file))
        (OK, error_list) = xconfiguration.update_region_zone_data(region_name, zone_name)
        if OK:
            print('The config file has been update.')
        else:
            for error in error_list:
                print(error)
            raise xlib.ProgramException('C001')

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
def get_genomic_features_dict(conn, transcript_seq_id, transcript_start, transcript_end):
    '''
    Get a sequence feature dictionary from the table "genomic_features"
    corresponding to a sequence identification whose feature interval
    covers the transcript interval (start <= transcript_start and
    end >= transcript_end).

    Parameters:
        conn: open connection to the SQLite database.
        transcript_seq_id: sequence identification of the transcript.
        transcript_start: start position of the transcript.
        transcript_end: end position of the transcript.

    Returns a dictionary keyed by consecutive integers from 0, each value a
    dictionary with keys "seq_id", "start", "end", "type" and "gene".

    Raises xlib.ProgramException on query errors.
    '''

    # initialize the sequence feature dictionary
    genomic_feature_dict = {}

    # initialize the dictionary key
    key = 0

    # select rows from the table "genomic_features"
    # FIX: values are bound with "?" placeholders instead of f-string
    # interpolation, so quoting problems and SQL injection are impossible
    sentence = '''
               SELECT seq_id, start, end, type, gene
                   FROM genomic_features
                   WHERE seq_id = ?
                     AND start <= ?
                     AND end >= ?;
               '''
    try:
        rows = conn.execute(sentence, (transcript_seq_id, transcript_start, transcript_end))
    except Exception as e:
        raise xlib.ProgramException(e, 'B002', sentence, conn)

    # add row data to the dictionary
    for row in rows:
        genomic_feature_dict[key] = {'seq_id': row[0], 'start': row[1], 'end': row[2], 'type': row[3], 'gene': row[4]}
        key += 1

    # return the sequence feature dictionary
    return genomic_feature_dict
def calculate_haplotype_statistics(loci_file_path, stats_file_path): ''' Calculates haplotype statistics per locus. ''' # open the loci file try: loci_file_id = open(loci_file_path, mode='r', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException(e, 'F001', loci_file_path) # set the pattern of the locus id records pattern1 = r'^\/\/(.*)\|(.*)\|$' # set the pattern of the locus information records pattern2 = r'^(.*) (.*)$' # initialize the list of locus information records locus_line_list = [] # initialize the dictionary of haplotype sequence number by locus haplotype_number_by_locus_dict = {} # initialize the dictionary of haplotype sequences in the locus haplotype_seqs_in_locus_dict = {} # read the first record of complete loci file record = loci_file_id.readline() # while there are records while record != '': # process the locus id record if record.startswith('//'): # extract the locus id mo = re.search(pattern1, record) variant_seq = mo.group(1) locus_id = mo.group(2) # write in locus statistics for i in range(len(locus_line_list)): # extract the taxon id and sequence mo = re.search(pattern2, locus_line_list[i]) taxon_id = mo.group(1).strip() sequence = mo.group(2).strip() # add the sequence to the dictionary of haplotype sequences in the locus if sequence not in haplotype_seqs_in_locus_dict: haplotype_seqs_in_locus_dict[sequence] = sequence # calculate de variant sequence variant_seq = variant_seq[-len(sequence):] xlib.Message.print( 'trace', f'locus_id: {locus_id:8} - variant_seq: >{variant_seq}<\n') # add the haplotype sequence number to the dictionary of haplotype sequence number by locus haplotype_number_by_locus_dict[locus_id] = len( haplotype_seqs_in_locus_dict.keys()) # initialize the list of locus information records locus_line_list = [] # initialize the dictionary of haplotype sequences in the locus haplotype_seqs_in_locus_dict = {} # process a locus information record else: # add the record to the list of locus information records 
locus_line_list.append(record) # read the next record of complete loci file record = loci_file_id.readline() # close file loci_file_id.close() # get a list of haplotype sequence number by locus sorted by locus identification haplotype_seqs_in_locus_list = sorted( haplotype_number_by_locus_dict.items(), key=operator.itemgetter(1)) # open the statistics file try: print() with open(stats_file_path, mode='w', encoding='iso-8859-1') as stats_file_id: stats_file_id.write('"haplotype number","locus identification"\n') for locus_info in haplotype_seqs_in_locus_list: stats_file_id.write( f'{locus_info[1]},"locus_{locus_info[0]}"\n') except Exception as e: raise xlib.ProgramException(e, 'F001', stats_file_path)
def check_args(args):
    '''
    Verify the input arguments data.

    Missing optional arguments are replaced by their defaults; numeric
    arguments are converted to int/float in place. Raises
    xlib.ProgramException('', 'P001') when any argument is invalid.
    '''

    # initialize the control variable
    OK = True

    # check the assembly_software_code value
    if args.assembly_software_code is None:
        xlib.Message.print('error', '*** The assembly software that generated the transcritpme file is not indicated in the input arguments.')
        OK = False
    elif args.assembly_software_code not in [xlib.Const.AS_TRINITY_CODE, xlib.Const.AS_SOAPDENOVOTRANS_CODE, xlib.Const.AS_GENERATED_BY_NGSCLOUD]:
        xlib.Message.print('error', f'*** {args.assembly_software_code} is not a valid code of assembly software.')
        OK = False

    # check the transcriptome_file value
    if args.transcriptome_file is None:
        xlib.Message.print('error', '*** A transcritpme file in Fasta format is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.transcriptome_file):
        xlib.Message.print('error', f'*** The file {args.transcriptome_file} does not exist.')
        OK = False

    # check the score_file value
    if args.score_file is None:
        xlib.Message.print('error', '*** A score file where RSEM-EVAL (DETONATE package) saved the score of the transcriptome file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.score_file):
        xlib.Message.print('error', f'*** The file {args.score_file} does not exist.')
        OK = False

    # check the output_file value: its directory is created when it does not exist
    if args.output_file is None:
        xlib.Message.print('error', '*** A output file where filtered transcripts will be saved is not indicated in the input arguments.')
        OK = False
    else:
        try:
            if not os.path.exists(os.path.dirname(args.output_file)):
                os.makedirs(os.path.dirname(args.output_file))
        except Exception as e:
            xlib.Message.print('error', f'*** The directory {os.path.dirname(args.output_file)} of the file {args.output_file} is not valid.')
            OK = False

    # check the minlen value
    if args.minlen is None:
        args.minlen = xlib.Const.DEFAULT_MINLEN
    elif not xlib.check_int(args.minlen, minimum=1):
        xlib.Message.print('error', '*** The minlen has to be a integer number greater than 0.')
        OK = False
    else:
        args.minlen = int(args.minlen)

    # check the maxlen value
    if args.maxlen is None:
        args.maxlen = xlib.Const.DEFAULT_MAXLEN
    elif not xlib.check_int(args.maxlen, minimum=1):
        xlib.Message.print('error', '*** The maxlen has to be a integer number greater than 0.')
        OK = False
    else:
        args.maxlen = int(args.maxlen)

    # check the minFPKM value
    if args.minFPKM is None:
        args.minFPKM = xlib.Const.DEFAULT_MINFPKM
    elif not xlib.check_float(args.minFPKM, minimum=0.0):
        # FIX: report through the application message channel (the original
        # used a bare print) and name the checked argument
        xlib.Message.print('error', '*** The minFPKM has to be a float number greater than or equal to 0.0.')
        OK = False
    else:
        args.minFPKM = float(args.minFPKM)

    # check the minTPM value
    if args.minTPM is None:
        args.minTPM = xlib.Const.DEFAULT_MINTPM
    elif not xlib.check_float(args.minTPM, minimum=0.0):
        # FIX: the original message wrongly said "FPKM" for this check and
        # used a bare print
        xlib.Message.print('error', '*** The minTPM has to be a float number greater than or equal to 0.0.')
        OK = False
    else:
        args.minTPM = float(args.minTPM)

    # check "verbose"
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace"
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # check if maxlen value is greater or equal than minlen value
    # (only when both were valid, so they are already converted to int)
    if OK:
        if args.maxlen < args.minlen:
            xlib.Message.print('error', '*** The maxlen value has to be greater than or equal to minlen.')
            OK = False

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
def load_table_ec_ids(conn, ec_id_file): ''' ''' # drop table "ec_ids" (if it exists) xlib.Message.print('verbose', 'Droping the table "ec_ids" ...\n') xsqlite.drop_ec_ids(conn) xlib.Message.print('verbose', 'The table is droped.\n') # create table "ec_ids" xlib.Message.print('verbose', 'Creating the table "ec_ids" ...\n') xsqlite.create_ec_ids(conn) xlib.Message.print('verbose', 'The table is created.\n') # open the EC id file if ec_id_file.endswith('.gz'): try: ec_id_file_id = gzip.open(ec_id_file, mode='rt', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException('F002', ec_id_file) else: try: ec_id_file_id = open(ec_id_file, mode='r', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException('F001', ec_id_file) # initialize the record counter record_counter = 0 # initialize the inserted row counter inserted_row_counter = 0 # read the first record record = ec_id_file_id.readline() # while there are records and they are the header while record != '' and not record.startswith('ID'): # add 1 to record counter record_counter += 1 # print record counter xlib.Message.print('verbose', f'\rProcessed records of EC id file: {record_counter} - Inserted rows: {inserted_row_counter}') # read the next record record = ec_id_file_id.readline() # if there is a first definition block if record.startswith('ID'): # while there are records and the record is an identification while record != '': # when the record is an identification if record.startswith('ID'): # add 1 to record counter record_counter += 1 # initialize the row dictionary row_dict = {} row_dict['ec_id'] = record[3:].strip() row_dict['desc'] = '' # print record counter xlib.Message.print('verbose', f'\rProcessed records of EC id file: {record_counter} - Inserted rows: {inserted_row_counter}') # read the next record record = ec_id_file_id.readline() # while there are records and the record is a definition while record != '' and record.startswith('DE'): # add 1 to record counter 
record_counter += 1 # concat the description if row_dict['desc'] == '': row_dict['desc'] = record[3:].strip() else: row_dict['desc'] = f'''{row_dict['desc']}, {record[3:].strip()}''' # change quotation marks and semicolons in "desc" row_dict['desc'] = row_dict['desc'].replace("'", '|').replace(';', ',') # print record counter xlib.Message.print('verbose', f'\rProcessed records of EC id file: {record_counter} - Inserted rows: {inserted_row_counter}') # read the next record record = ec_id_file_id.readline() # insert data into table "ec_ids" row_dict['desc'] = row_dict['desc'][:-1] xsqlite.insert_ec_ids_row(conn, row_dict) inserted_row_counter += 1 # while there are records and the record is not an identification and is not a definition while record != '' and not record.startswith('ID') and not record.startswith('DE'): # add 1 to record counter record_counter += 1 # print record counter xlib.Message.print('verbose', f'\rProcessed records of EC id file: {record_counter} - Inserted rows: {inserted_row_counter}') # read the next record record = ec_id_file_id.readline() xlib.Message.print('verbose', '\n') # close EC id file ec_id_file_id.close() # create the index on the table "ec_ids" xlib.Message.print('verbose', 'Creating the index on the table "ec_ids" ...\n') xsqlite.create_ec_ids_index(conn) xlib.Message.print('verbose', 'The index is created.\n') # save changes into TOA database xlib.Message.print('verbose', 'Saving changes into TOA database ...\n') conn.commit() xlib.Message.print('verbose', 'Changes are saved.\n')
def load_table_datasets(conn, dataset_file):
    '''
    Load the table "datasets" of the TOA database from a dataset file whose
    data records hold four double-quoted, semicolon-separated fields.

    Parameters:
        conn: open connection to the TOA database.
        dataset_file: path to the dataset file (plain text or gzip-compressed).

    Raises xlib.ProgramException when the file cannot be opened or a data
    record does not match the expected format.
    '''

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # set the pattern of the data records
    # format: "repository_id";"dataset_id";"dataset_name";"ftp_adress"
    # NOTE(review): the extraction below maps group 1 to dataset_id and
    # group 3 to repository_id, which contradicts the field order announced
    # in the format comment above — confirm the real order of the input file
    record_pattern = re.compile(r'^"(.*)";"(.*)";"(.*)";"(.*)"$')

    # drop table "datasets"
    xlib.Message.print('verbose', 'Droping the table "datasets" ...\n')
    xsqlite.drop_datasets(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "datasets"
    xlib.Message.print('verbose', 'Creating the table "datasets" ...\n')
    xsqlite.create_datasets(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # open the file of datasets (transparently handling gzip compression)
    if dataset_file.endswith('.gz'):
        try:
            dataset_file_id = gzip.open(dataset_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', dataset_file)
    else:
        try:
            dataset_file_id = open(dataset_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', dataset_file)

    # read the first record
    record = dataset_file_id.readline()

    # while there are records
    while record != '':

        # add 1 to record counter
        record_counter += 1

        # process data records (comment lines and blank lines are skipped)
        if not record.lstrip().startswith('#') and record.strip() != '':

            # initialize the row data dictionary
            row_dict = {}

            # extract data
            try:
                mo = record_pattern.match(record)
                row_dict['dataset_id'] = mo.group(1).strip().lower()
                row_dict['dataset_name'] = mo.group(2).strip()
                row_dict['repository_id'] = mo.group(3).strip().lower()
                row_dict['ftp_adress'] = mo.group(4).strip()
            except Exception as e:
                raise xlib.ProgramException('F006', os.path.basename(dataset_file), record_counter)

            # review null values of "ftp_adress"
            if row_dict['ftp_adress'] == '':
                row_dict['ftp_adress'] = xlib.get_na()

            # insert data into table "datasets"
            xsqlite.insert_datasets_row(conn, row_dict)
            inserted_row_counter += 1

            # print record counter
            xlib.Message.print('verbose', f'\rProcessed records of dataset file: {record_counter} - Inserted rows: {inserted_row_counter}')

        # read the next record
        record = dataset_file_id.readline()

    xlib.Message.print('verbose', '\n')

    # create the index on the table "datasets"
    xlib.Message.print('verbose', 'Creating the index on the table "datasets" ...\n')
    xsqlite.create_datasets_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')

    # close dataset file
    dataset_file_id.close()
def save_annotation_file_merger_format(annotation_file_1, type_1, merger_file, header):
    '''
    Save a annotation file with record format "PLAZA", "REFSEQ", "NT" or "NR" in record format "MERGER".
    '''

    def _open_text_file(path, for_writing):
        # open a plain or gzip-compressed text file with the encoding used
        # by the annotation files, raising the matching error code on failure
        compressed = path.endswith('.gz')
        if for_writing:
            mode = 'wt' if compressed else 'w'
            error_code = 'F004' if compressed else 'F003'
        else:
            mode = 'rt' if compressed else 'r'
            error_code = 'F002' if compressed else 'F001'
        opener = gzip.open if compressed else open
        try:
            if for_writing:
                return opener(path, mode=mode, encoding='iso-8859-1', newline='\n')
            return opener(path, mode=mode, encoding='iso-8859-1')
        except Exception:
            raise xlib.ProgramException(error_code, path)

    # open the annotation file for reading and the merger file for writing
    annotation_file_1_id = _open_text_file(annotation_file_1, for_writing=False)
    merger_file_id = _open_text_file(merger_file, for_writing=True)

    # initialize record counters
    read_record_counter_1 = 0
    written_record_counter = 0

    # print header record in merged file if necessary
    if header == 'Y':
        xlib.write_annotation_header(merger_file_id, 'MERGER')
        written_record_counter += 1

    # read the first record of the annotation file
    (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
    xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

    # copy every annotation record to the merged file in "MERGER" format
    while record_1 != '':
        read_record_counter_1 += 1
        xlib.write_merged_annotation_record(merger_file_id, type_1, data_dict_1)
        written_record_counter += 1
        xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')
        (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
        xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

    # print summary
    xlib.Message.print('verbose', '\n')
    xlib.Message.print('info', f'{read_record_counter_1} read records in the annotation file.')
    xlib.Message.print('info', f'{written_record_counter} written records in the merged annotation file.')

    # close files
    annotation_file_1_id.close()
    merger_file_id.close()
def merge_files_operation_1and2(annotation_file_1, type_1, annotation_file_2, type_2, merger_file, header):
    '''
    Merge annotation files with operation "1AND2" (annotations included in both files).

    A merge walk over the two files writes every record of both inputs to
    the merged file; records with equal keys are written pairwise.
    NOTE(review): the walk assumes both input files are sorted by
    annotation key — confirm with the callers.

    Raises xlib.ProgramException when a file cannot be opened.
    '''

    # open the first annotation file (transparently handling gzip compression)
    if annotation_file_1.endswith('.gz'):
        try:
            annotation_file_1_id = gzip.open(annotation_file_1, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', annotation_file_1)
    else:
        try:
            annotation_file_1_id = open(annotation_file_1, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', annotation_file_1)

    # open the second annotation file (transparently handling gzip compression)
    if annotation_file_2.endswith('.gz'):
        try:
            annotation_file_2_id = gzip.open(annotation_file_2, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', annotation_file_2)
    else:
        try:
            annotation_file_2_id = open(annotation_file_2, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', annotation_file_2)

    # open the merged annotation file for writing
    if merger_file.endswith('.gz'):
        try:
            merger_file_id = gzip.open(merger_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException('F004', merger_file)
    else:
        try:
            merger_file_id = open(merger_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException('F003', merger_file)

    # initialize record counters
    read_record_counter_1 = 0
    read_record_counter_2 = 0
    written_record_counter = 0

    # print header record in merged file if necessary
    if header == 'Y':
        xlib.write_annotation_header(merger_file_id, 'MERGER')
        written_record_counter += 1

    # read the first record of the first annotation file
    (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
    xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

    # read the first record of the second annotation file
    (record_2, key_2, data_dict_2) = xlib.read_annotation_record(annotation_file_2, annotation_file_2_id, type_2, read_record_counter_2)
    xlib.Message.print('trace', f'key_2: {key_2} - record_2: {record_2}')

    # while there are records in any annotation file
    while record_1 != '' or record_2 != '':

        # while there are records in the first annotation file and key of the first annotation file is less than the key of the second annotation file
        # (or the second file is exhausted)
        while record_1 != '' and (record_2 != '' and key_1 < key_2 or record_2 == ''):

            # add 1 to record counter
            read_record_counter_1 += 1

            # write in the merged annotation file
            xlib.write_merged_annotation_record(merger_file_id, type_1, data_dict_1)
            written_record_counter += 1
            xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

            # read the next record of the first annotation file
            (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
            xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

        # while there are records in both annotation files and key of the first annotation file is equal to the key of the second annotation file:
        # both records are written, then both files advance
        # NOTE(review): read_record_counter_2 is NOT incremented in this
        # loop although a record of the second file is consumed — the final
        # summary undercounts and the counter passed to
        # xlib.read_annotation_record lags; confirm whether this is intended
        while record_1 != '' and record_2 != '' and key_1 == key_2:

            # add 1 to record counter
            read_record_counter_1 += 1

            # write the first file record in the merged annotation file
            xlib.write_merged_annotation_record(merger_file_id, type_1, data_dict_1)
            written_record_counter += 1
            xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

            # read the next record of the first annotation file
            (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
            xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

            # write the second file record in the merged annotation file
            xlib.write_merged_annotation_record(merger_file_id, type_2, data_dict_2)
            written_record_counter += 1
            xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

            # read the next record of the second annotation file
            (record_2, key_2, data_dict_2) = xlib.read_annotation_record(annotation_file_2, annotation_file_2_id, type_2, read_record_counter_2)
            xlib.Message.print('trace', f'key_2: {key_2} - record_2: {record_2}')

        # while there are records in the second annotation file and key of the first annotation file is greater than the key of the second annotation file
        # (or the first file is exhausted)
        while record_2 != '' and (record_1 != '' and key_1 > key_2 or record_1 == ''):

            # add 1 to record counter
            read_record_counter_2 += 1

            # write in the merged annotation file
            xlib.write_merged_annotation_record(merger_file_id, type_2, data_dict_2)
            written_record_counter += 1
            xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

            # read the next record of the second annotation file
            (record_2, key_2, data_dict_2) = xlib.read_annotation_record(annotation_file_2, annotation_file_2_id, type_2, read_record_counter_2)
            xlib.Message.print('trace', f'key_2: {key_2} - record_2: {record_2}')

    # print summary
    xlib.Message.print('verbose', '\n')
    xlib.Message.print('info', f'{read_record_counter_1} records read from the first annotation file.')
    xlib.Message.print('info', f'{read_record_counter_2} records read from the second annotation file.')
    xlib.Message.print('info', f'{written_record_counter} records written in the merged annotation file.')

    # close files
    annotation_file_1_id.close()
    annotation_file_2_id.close()
    merger_file_id.close()
def load_table_blast_5(conn, dataset_id, blast_file):
    '''
    Load the table "blast" of the TOA database from a BLAST XML file:
    one row is inserted per iteration-hit-hsp combination. An empty BLAST
    file is silently skipped.

    Parameters:
        conn: open connection to the TOA database.
        dataset_id: identification of the dataset the rows belong to.
        blast_file: path to the BLAST XML file.

    Raises xlib.ProgramException when the BLAST file cannot be opened.
    '''

    # check if BLAST file is not empty
    try:
        blast_file_id = open(blast_file, mode='r', encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException('F001', blast_file)
    record = blast_file_id.readline()
    # FIX: the probe file handle was never closed in the original code
    blast_file_id.close()
    if record == '':
        return

    # initialize the iteration counter
    iteration_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # create table "blast"
    xlib.Message.print('verbose', 'Creating the table "blast" (if it does not exist) ...\n')
    xsqlite.create_blast(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # create the index on the table "blast"
    xlib.Message.print('verbose', 'Creating the index on the table "blast" (if it does not exist) ...\n')
    xsqlite.create_blast_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # delete files from table "blast" corresponding to the repository and dataset identification
    xlib.Message.print('verbose', 'Deleting previous rows from the table "blast" ...\n')
    xsqlite.delete_blast_rows(conn, dataset_id)
    xlib.Message.print('verbose', 'Rows are deleted.\n')

    # build the complete item tree from BLAST XML file
    tree = xml.etree.ElementTree.parse(blast_file)
    root = tree.getroot()

    # walk the tree and insert data into table "blast" for each iteration-hit-hsp
    for item_blastoutput_iterations in root.iter(tag='BlastOutput_iterations'):
        xlib.Message.print('verbose', f'-> tag: {item_blastoutput_iterations.tag} - attrib: {item_blastoutput_iterations.attrib} - text: {item_blastoutput_iterations.text}\n')

        # get items "Iteration"
        for item_iteration in item_blastoutput_iterations.iter(tag='Iteration'):
            xlib.Message.print('verbose', f'---> tag: {item_iteration.tag} - attrib: {item_iteration.attrib} - text: {item_iteration.text}\n')

            # initialize the row data dictionary
            row_dict = {}
            row_dict['dataset_id'] = dataset_id

            # add 1 to iteration counter
            iteration_counter += 1

            # initialize iteration data
            # FIX: the original assigned these defaults to dead local
            # variables; storing them in row_dict means an iteration missing
            # these XML items no longer breaks the insert with a KeyError
            row_dict['iteration_iter_num'] = 0
            row_dict['iteration_query_def'] = ''

            # get data of item "Iteration_iter-num"
            for item_iteration_iter_num in item_iteration.iter(tag='Iteration_iter-num'):
                xlib.Message.print('verbose', f'-----> tag: {item_iteration_iter_num.tag} - attrib: {item_iteration_iter_num.attrib} - text: {item_iteration_iter_num.text}\n')
                row_dict['iteration_iter_num'] = int(item_iteration_iter_num.text)

            # get data of item "Iteration_query-def"
            for item_iteration_query_def in item_iteration.iter(tag='Iteration_query-def'):
                xlib.Message.print('verbose', f'-----> tag: {item_iteration_query_def.tag} - attrib: {item_iteration_query_def.attrib} - text: {item_iteration_query_def.text}\n')
                row_dict['iteration_query_def'] = item_iteration_query_def.text

            # get items "Iteration_hits"
            for item_iteration_hits in item_iteration.iter(tag='Iteration_hits'):
                xlib.Message.print('verbose', f'-----> tag: {item_iteration_hits.tag} - attrib: {item_iteration_hits.attrib} - text: {item_iteration_hits.text}\n')

                # get items "Hit"
                for item_hit in item_iteration_hits.iter(tag='Hit'):
                    xlib.Message.print('verbose', f'-------> tag: {item_hit.tag} - attrib: {item_hit.attrib} - text: {item_hit.text}')

                    # initialize hit data
                    row_dict['hit_num'] = 0
                    row_dict['hit_id'] = xlib.get_na()
                    row_dict['hit_def'] = xlib.get_na()
                    row_dict['hit_accession'] = xlib.get_na()

                    # get data of item "Hit_num"
                    for item_hit_num in item_hit.iter(tag='Hit_num'):
                        xlib.Message.print('verbose', f'---------> tag: {item_hit_num.tag} - attrib: {item_hit_num.attrib} - text: {item_hit_num.text}\n')
                        row_dict['hit_num'] = int(item_hit_num.text)

                    # get data of item "Hit_id"
                    for item_hit_id in item_hit.iter(tag='Hit_id'):
                        xlib.Message.print('verbose', f'---------> tag: {item_hit_id.tag} - attrib: {item_hit_id.attrib} - text: {item_hit_id.text}\n')
                        row_dict['hit_id'] = item_hit_id.text

                    # get data of item "Hit_def" (quotes and semicolons are
                    # replaced; a None text is kept unchanged)
                    for item_hit_def in item_hit.iter(tag='Hit_def'):
                        xlib.Message.print('verbose', f'---------> tag: {item_hit_def.tag} - attrib: {item_hit_def.attrib} - text: {item_hit_def.text}\n')
                        try:
                            row_dict['hit_def'] = item_hit_def.text.replace("'", '|').replace(';', ',')
                        except Exception:
                            # FIX: narrowed from a bare "except:" clause
                            row_dict['hit_def'] = item_hit_def.text

                    # get data of item "Hit_accession"
                    for item_hit_accession in item_hit.iter(tag='Hit_accession'):
                        xlib.Message.print('verbose', f'---------> tag: {item_hit_accession.tag} - attrib: {item_hit_accession.attrib} - text: {item_hit_accession.text}\n')
                        row_dict['hit_accession'] = item_hit_accession.text

                    # get items "Hit_hsps"
                    for item_hit_hsps in item_hit.iter(tag='Hit_hsps'):
                        xlib.Message.print('verbose', f'---------> tag: {item_hit_hsps.tag} - attrib: {item_hit_hsps.attrib} - text: {item_hit_hsps.text}\n')

                        # get items "Hsp"
                        # NOTE(review): the scan starts from item_hit, not
                        # item_hit_hsps; the result is the same only while a
                        # Hit contains a single Hit_hsps element — confirm
                        for item_hsp in item_hit.iter(tag='Hsp'):
                            xlib.Message.print('verbose', f'-----------> tag: {item_hsp.tag} - attrib: {item_hsp.attrib} - text: {item_hsp.text}\n')

                            # initialize hsp data
                            row_dict['hsp_num'] = 0
                            row_dict['hsp_evalue'] = 0.
                            row_dict['hsp_identity'] = 0
                            row_dict['hsp_positive'] = 0
                            row_dict['hsp_gaps'] = 0
                            row_dict['hsp_align_len'] = 0
                            row_dict['hsp_qseq'] = ''

                            # get data of item "Hsp_num"
                            for item_hsp_num in item_hsp.iter(tag='Hsp_num'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_num.tag} - attrib: {item_hsp_num.attrib} - text: {item_hsp_num.text}\n')
                                row_dict['hsp_num'] = int(item_hsp_num.text)

                            # get data of item "Hsp_evalue"
                            for item_hsp_evalue in item_hsp.iter(tag='Hsp_evalue'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_evalue.tag} - attrib: {item_hsp_evalue.attrib} - text: {item_hsp_evalue.text}\n')
                                row_dict['hsp_evalue'] = float(item_hsp_evalue.text)

                            # get data of item "Hsp_identity"
                            for item_hsp_identity in item_hsp.iter(tag='Hsp_identity'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_identity.tag} - attrib: {item_hsp_identity.attrib} - text: {item_hsp_identity.text}\n')
                                row_dict['hsp_identity'] = int(item_hsp_identity.text)

                            # get data of item "Hsp_positive"
                            for item_hsp_positive in item_hsp.iter(tag='Hsp_positive'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_positive.tag} - attrib: {item_hsp_positive.attrib} - text: {item_hsp_positive.text}\n')
                                row_dict['hsp_positive'] = int(item_hsp_positive.text)

                            # get data of item "Hsp_gaps"
                            for item_hsp_gaps in item_hsp.iter(tag='Hsp_gaps'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_gaps.tag} - attrib: {item_hsp_gaps.attrib} - text: {item_hsp_gaps.text}\n')
                                row_dict['hsp_gaps'] = int(item_hsp_gaps.text)

                            # get data of item "Hsp_align-len"
                            for item_hsp_align_len in item_hsp.iter(tag='Hsp_align-len'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_align_len.tag} - attrib: {item_hsp_align_len.attrib} - text: {item_hsp_align_len.text}\n')
                                row_dict['hsp_align_len'] = int(item_hsp_align_len.text)

                            # get data of item "Hsp_qseq"
                            for item_hsp_qseq in item_hsp.iter(tag='Hsp_qseq'):
                                xlib.Message.print('verbose', f'-------------> tag: {item_hsp_qseq.tag} - attrib: {item_hsp_qseq.attrib} - text: {item_hsp_qseq.text}\n')
                                row_dict['hsp_qseq'] = item_hsp_qseq.text

                            # insert data into table "blast"
                            xsqlite.insert_blast_row(conn, row_dict)
                            inserted_row_counter += 1

            # print iteration counter
            xlib.Message.print('verbose', f'\rIterations: {iteration_counter} - Inserted rows: {inserted_row_counter}')

    xlib.Message.print('verbose', '\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')
def check_args(args):
    '''
    Check the input arguments.
    '''

    # assume the arguments are valid until a problem is detected
    is_ok = True

    # "toa_database" is mandatory
    if args.toa_database is None:
        xlib.Message.print('error', '*** The TOA database is not indicated in the input arguments.')
        is_ok = False

    # "dataset_id" is mandatory and is normalized to lower case
    if args.dataset_id is None:
        xlib.Message.print('error', '*** The dataset identification is not indicated in the input arguments.')
        is_ok = False
    else:
        args.dataset_id = args.dataset_id.lower()

    # "blast_file_format" is mandatory and has to be an admitted code
    if args.blast_file_format is None:
        xlib.Message.print('error', '*** The BLAST file format is not indicated in the input arguments.')
        is_ok = False
    elif not xlib.check_code(args.blast_file_format, xlib.get_blast_file_format_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** The BLAST file format has to be {xlib.get_blast_file_format_code_list_text()}.')
        is_ok = False

    # "blast_file" is mandatory and has to exist
    if args.blast_file is None:
        xlib.Message.print('error', '*** The BLAST file is not indicated in the input arguments.')
        is_ok = False
    elif not os.path.isfile(args.blast_file):
        xlib.Message.print('error', f'*** The file {args.blast_file} does not exist.')
        is_ok = False

    # "verbose" takes a default and, when Y, activates the verbose status
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        is_ok = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace" takes a default and, when Y, activates the trace status
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        is_ok = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # abort with exception when any check failed
    if not is_ok:
        raise xlib.ProgramException('P001')
def extract_annotations(annotation_file, type, id_file, extract_file, stats_file):
    '''
    Extract the annotation records whose sequence identification is included in
    the identification file, write them to the extracted annotation file and
    finally write per-identification statistics.

    NOTE(review): the parameter name "type" shadows the built-in; it is kept
    unchanged to preserve the public interface.
    '''

    # get the identification list and the per-identification annotation counters
    (id_list, id_dict) = get_id_data(id_file)

    # open the annotation file (plain or gzip-compressed)
    if annotation_file.endswith('.gz'):
        try:
            annotation_file_id = gzip.open(annotation_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            # chain the original exception so the root cause is not lost
            raise xlib.ProgramException('F002', annotation_file) from e
    else:
        try:
            annotation_file_id = open(annotation_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', annotation_file) from e

    # open the extracted identification file (plain or gzip-compressed)
    if extract_file.endswith('.gz'):
        try:
            extract_file_id = gzip.open(extract_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException('F004', extract_file) from e
    else:
        try:
            extract_file_id = open(extract_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException('F003', extract_file) from e

    # initialize record counters
    read_record_counter = 0
    written_record_counter = 0

    # write header record in the extracted identification file
    xlib.write_annotation_header(extract_file_id, type)
    written_record_counter += 1

    # read the first record of the annotation file (header)
    read_record_counter += 1
    (record, key, data_dict) = xlib.read_annotation_record(annotation_file, annotation_file_id, type, read_record_counter)
    xlib.Message.print('trace', f'key: {key} - record: {record}')

    # while there are records
    while record != '':

        # get the identification of the current record
        # (local renamed from "id" to avoid shadowing the built-in)
        seq_id = key

        # this sentence block is only used in a particular case:
        # derive the identification from composite keys
        if key.startswith('CUFF'):
            first_dot_position = key.find('.')
            second_dot_position = key.find('.', first_dot_position + 1)
            seq_id = key[:second_dot_position]
        elif key.startswith('scaffold'):
            seq_id = key[:key.find(' ')]

        # if the key is in the identification list
        if seq_id in id_list:

            # add 1 to the annotation counter of the identification
            id_dict[seq_id] += 1

            # write in the extracted identification file
            xlib.write_merged_annotation_record(extract_file_id, type, data_dict)
            written_record_counter += 1

        xlib.Message.print('verbose', f'\rRead annotations: {read_record_counter} - Written annotations: {written_record_counter}')

        # read the next record of the annotation file
        read_record_counter += 1
        (record, key, data_dict) = xlib.read_annotation_record(annotation_file, annotation_file_id, type, read_record_counter)
        xlib.Message.print('trace', f'key: {key} - record: {record}')

    xlib.Message.print('verbose', '\n')

    # print summary (the first read record is the header, hence the "- 1")
    xlib.Message.print('info', f'{read_record_counter - 1} annotations read in annotation file.')
    xlib.Message.print('info', f'{written_record_counter} annotations written in the extracted identification file.')

    # close files
    annotation_file_id.close()
    extract_file_id.close()

    # write stats
    write_stats(stats_file, id_list, id_dict)
def check_args(args):
    '''
    Check the input arguments.
    '''

    # assume the arguments are valid until a problem is detected
    is_ok = True

    # "annotation_file" is mandatory and has to exist
    if args.annotation_file is None:
        xlib.Message.print('error', '*** The annotation file is not indicated in the input arguments.')
        is_ok = False
    elif not os.path.isfile(args.annotation_file):
        xlib.Message.print('error', f'*** The file {args.annotation_file} does not exist.')
        is_ok = False

    # "type" is mandatory, has to be an admitted code and is normalized to upper case
    if args.type is None:
        xlib.Message.print('error', '*** The type of annotation file is not indicated in the input arguments.')
        is_ok = False
    elif not xlib.check_code(args.type, xlib.get_type_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** The type of annotation file has to be {xlib.get_type_code_list_text()}.')
        is_ok = False
    else:
        args.type = args.type.upper()

    # "id_file" is mandatory and has to exist
    if args.id_file is None:
        xlib.Message.print('error', '*** The identification file is not indicated in the input arguments.')
        is_ok = False
    elif not os.path.isfile(args.id_file):
        xlib.Message.print('error', f'*** The file {args.id_file} does not exist.')
        is_ok = False

    # "extract_file" is mandatory
    if args.extract_file is None:
        xlib.Message.print('error', '*** The extracted annotation file is not indicated in the input arguments.')
        is_ok = False

    # "stats_file" is mandatory
    if args.stats_file is None:
        xlib.Message.print('error', '*** The statistics file is not indicated in the input arguments.')
        is_ok = False

    # "verbose" takes a default and, when Y, activates the verbose status
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        is_ok = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace" takes a default and, when Y, activates the trace status
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        is_ok = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # abort with exception when any check failed
    if not is_ok:
        raise xlib.ProgramException('P001')
def form_set_environment():
    '''
    Set the environment: pick (or record) an environment name, create the
    NGScloud config files for it when they do not exist yet, and set the
    corresponding environment variables.
    '''

    # print headers
    clib.clear_screen()
    clib.print_headers_without_environment('Set environment')

    # initialize the environment
    # (a leftover debug print of the function name via sys._getframe was removed here)
    xconfiguration.environment = ''

    # get the current environments list
    environments_list = xconfiguration.get_environments_list()

    # print the available environment names
    if environments_list != []:
        print('Current environments list: {0} ...'.format(str(environments_list).strip('[]').replace('\'', '')))
        input_text = '... Enter the environment name: '
    else:
        print('Currently there is not any environment recorded.')
        input_text = 'Enter a new environment name: '

    # input and validate the environment; an unknown name can be recorded on request
    while xconfiguration.environment == '':
        xconfiguration.environment = input(input_text)
        if xconfiguration.environment not in environments_list:
            print(xlib.get_separator())
            answer = input('{0} is not a recorded environment. Do you like to record it? (Y/N): '.format(xconfiguration.environment))
            if answer not in ['Y', 'y']:
                # not recorded: ask again
                xconfiguration.environment = ''
            else:
                (OK, error_list) = xconfiguration.add_environment(xconfiguration.environment)
                if not OK:
                    for error in error_list:
                        print(error)
                    raise xlib.ProgramException('C002')

    # check if it is necessary to create the NGScloud config file corresponding to the environment
    if not xconfiguration.is_ngscloud_config_file_created():

        print(xlib.get_separator())
        print('Creating the config files ...')

        # create the NGScloud config file
        form_create_ngscloud_config_file(is_menu_call=False)

        # create the key pairs directory
        if not os.path.exists(xlib.get_keypairs_dir()):
            os.makedirs(xlib.get_keypairs_dir())

        # create the config file of every application
        # NOTE(review): the (OK, error_list) results of all these creations are
        # discarded, so failures here are silent — confirm this is intended.
        (OK, error_list) = xbusco.create_busco_config_file()
        (OK, error_list) = xcdhit.create_cd_hit_est_config_file()
        (OK, error_list) = xfastqc.create_fastqc_config_file()
        (OK, error_list) = xgmap.create_gmap_config_file()
        (OK, error_list) = xtrinity.create_insilico_read_normalization_config_file()
        (OK, error_list) = xquast.create_quast_config_file()
        (OK, error_list) = xdetonate.create_ref_eval_config_file()
        (OK, error_list) = xrnaquast.create_rnaquast_config_file()
        (OK, error_list) = xdetonate.create_rsem_eval_config_file()
        (OK, error_list) = xsoapdenovotrans.create_soapdenovotrans_config_file()
        (OK, error_list) = xstar.create_star_config_file()
        (OK, error_list) = xtransabyss.create_transabyss_config_file()
        (OK, error_list) = xngshelper.create_transcript_filter_config_file()
        (OK, error_list) = xngshelper.create_transcriptome_blastx_config_file()
        (OK, error_list) = xtransrate.create_transrate_config_file()
        (OK, error_list) = xtrimmomatic.create_trimmomatic_config_file()
        (OK, error_list) = xtrinity.create_trinity_config_file()

        # create the transfer config files
        (OK, error_list) = xreference.create_reference_transfer_config_file()
        (OK, error_list) = xdatabase.create_database_transfer_config_file()
        (OK, error_list) = xread.create_read_transfer_config_file()
        (OK, error_list) = xresult.create_result_transfer_config_file(status='uncompressed')

        # create the gzip config files
        (OK, error_list) = xgzip.create_gzip_config_file(dataset_type='reference')
        (OK, error_list) = xgzip.create_gzip_config_file(dataset_type='database')
        (OK, error_list) = xgzip.create_gzip_config_file(dataset_type='read')
        (OK, error_list) = xgzip.create_gzip_config_file(dataset_type='result')

    # set the environment variables corresponding to the NGScloud config file, the AWS access key
    # identification, AWS secret access key and the current region name
    print(xlib.get_separator())
    print('Setting the environment variables ...')
    xconfiguration.set_environment_variables()
    print('The environment variables are set.')

    # show continuation message
    print(xlib.get_separator())
    input('Press [Intro] to continue ...')
def check_args(args):
    '''
    Check the input arguments.
    '''

    # assume the arguments are valid until a problem is detected
    is_ok = True

    # "ngshelper_database" is mandatory
    if args.ngshelper_database is None:
        xlib.Message.print('error', '*** The NGShelper database is not indicated in the input arguments.')
        is_ok = False

    # "vcf_file" is mandatory and has to exist
    if args.vcf_file is None:
        xlib.Message.print('error', '*** The VCF file is not indicated in the input arguments.')
        is_ok = False
    elif not os.path.isfile(args.vcf_file):
        xlib.Message.print('error', f'*** The file {args.vcf_file} does not exist.')
        is_ok = False

    # "sample_file" is mandatory and has to exist
    if args.sample_file is None:
        xlib.Message.print('error', '*** The sample file is not indicated in the input arguments.')
        is_ok = False
    elif not os.path.isfile(args.sample_file):
        xlib.Message.print('error', f'*** The file {args.sample_file} does not exist.')
        is_ok = False

    # "sp1_id" is mandatory
    if args.sp1_id is None:
        xlib.Message.print('error', '*** The identification of the first species is not indicated in the input arguments.')
        is_ok = False

    # "sp2_id" is mandatory
    if args.sp2_id is None:
        xlib.Message.print('error', '*** The identification of the second species is not indicated in the input arguments.')
        is_ok = False

    # "hybrid_id" defaults to NONE
    if args.hybrid_id is None:
        args.hybrid_id = 'NONE'

    # "imputed_md_id" takes its default when missing
    if args.imputed_md_id is None:
        args.imputed_md_id = xlib.Const.DEFAULT_IMPUTED_MD_ID

    # "new_md_id" takes its default when missing
    if args.new_md_id is None:
        args.new_md_id = xlib.Const.DEFAULT_NEW_MD_ID

    # "allele_transformation" defaults to NONE, has to be an admitted code and is normalized to upper case
    if args.allele_transformation is None:
        args.allele_transformation = 'NONE'
    elif not xlib.check_code(args.allele_transformation, xlib.get_allele_transformation_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** The allele transformation has to be {xlib.get_allele_transformation_code_list_text()}.')
        is_ok = False
    else:
        args.allele_transformation = args.allele_transformation.upper()

    # "verbose" takes a default and, when Y, activates the verbose status
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        is_ok = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace" takes a default and, when Y, activates the trace status
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        is_ok = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # "tvi_list": NONE or missing means an empty list of variants to trace
    if args.tvi_list is None or args.tvi_list == 'NONE':
        args.tvi_list = []
    else:
        args.tvi_list = xlib.split_literal_to_string_list(args.tvi_list)

    # check the identification set: species and hybrid identifications have to differ
    # (NOTE(review): hybrid_id can no longer be None at this point, so the
    # "is not None" test is always true — kept to preserve the original logic)
    if is_ok:
        if args.sp1_id == args.sp2_id or (args.hybrid_id is not None and (args.sp1_id == args.hybrid_id or args.sp2_id == args.hybrid_id)):
            xlib.Message.print('error', 'The identifications must be different.')
            is_ok = False

    # abort with exception when any check failed
    if not is_ok:
        raise xlib.ProgramException('', 'P001')
def get_ko_annotations(transcripts_with_ko_file, annotation_dict):
    '''
    Read the transcripts-with-KO file and accumulate its KO identifications and
    descriptions into the annotation dictionary, which is returned.
    '''

    # initialize the record counter
    record_counter = 0

    # open the transcripts with KO file (plain or gzip-compressed)
    if transcripts_with_ko_file.endswith('.gz'):
        try:
            transcripts_with_ko_file_id = gzip.open(transcripts_with_ko_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', transcripts_with_ko_file)
    else:
        try:
            transcripts_with_ko_file_id = open(transcripts_with_ko_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', transcripts_with_ko_file)

    # read the first record
    record = transcripts_with_ko_file_id.readline()

    # while there are records
    while record != '':

        # add 1 to record counter
        record_counter += 1

        # process data records (comment records start with "#")
        if not record.startswith('#'):

            # extract data
            # record format: counter <TAB> transcript_id <TAB> ko <TAB> description
            # (str.split replaces the former manual scan of tab positions; each
            # field is stripped, which also removes the trailing newline)
            data_list = [field.strip() for field in record.split('\t')]
            try:
                transcript_id = data_list[1]
                ko = data_list[2]
                description = data_list[3]
            except Exception as e:
                raise xlib.ProgramException(e, 'F006', os.path.basename(transcripts_with_ko_file), record_counter)

            # change quotation marks in "description"
            description = description.replace("'", '|')

            # insert data into annotation dictionary, appending KO data with "*"
            # to any previous values (the sub-dictionary is fetched once instead
            # of once per field)
            transcript_data = annotation_dict.get(transcript_id, {})
            go_id_w = transcript_data.get('go_id', '')
            go_desc_w = transcript_data.get('go_desc', '')
            gf_id_w = transcript_data.get('gf_id', '')
            ko_id_w = transcript_data.get('ko_id', '')
            ko_id_w = ko if ko_id_w == '' else f'{ko_id_w}*{ko}'
            ko_desc_w = transcript_data.get('ko_desc', '')
            ko_desc_w = description if ko_desc_w == '' else f'{ko_desc_w}*{description}'
            annotation_dict[transcript_id] = {'go_id': go_id_w, 'go_desc': go_desc_w, 'gf_id': gf_id_w, 'ko_id': ko_id_w, 'ko_desc': ko_desc_w}

        # print counters
        xlib.Message.print('verbose', f'\rProcessed records of transcripts with KO file: {record_counter}')

        # read the next record
        record = transcripts_with_ko_file_id.readline()

    xlib.Message.print('verbose', '\n')

    # close transcripts with KO file
    transcripts_with_ko_file_id.close()

    # return the annotation dictionary
    return annotation_dict
def convert_simhyb_to_structure(simhyb_file, header_row_number, structure_file):
    '''
    Convert an output SimHyb file to the input Structure format in two lines
    (one line per individual for each of the two genotype columns).
    '''

    # initialize the loci number (-1 means "not established yet")
    loci_number = -1

    # open the SimHyb file (plain or gzip-compressed)
    if simhyb_file.endswith('.gz'):
        try:
            simhyb_file_id = gzip.open(simhyb_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', simhyb_file)
    else:
        try:
            simhyb_file_id = open(simhyb_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', simhyb_file)

    # open the Structure file (plain or gzip-compressed)
    if structure_file.endswith('.gz'):
        try:
            structure_file_id = gzip.open(structure_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', structure_file)
    else:
        try:
            structure_file_id = open(structure_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', structure_file)

    # initialize record counters
    input_record_counter = 0
    written_record_counter = 0

    # read the first record of the SimHyb file
    record = simhyb_file_id.readline()

    # while there are records in the SimHyb file
    while record != '':

        # add 1 to input record counter
        input_record_counter += 1

        # when the record has data (i.e. it is past the header rows)
        if input_record_counter > header_row_number:

            # extract data: split on tabs; the last field is appended only when
            # it is not empty (str.split replaces the former manual scan, and
            # last_data is reused instead of being recomputed)
            fields = record.split('\t')
            data_list = [field.strip() for field in fields[:-1]]
            last_data = fields[-1].strip('\n').strip()
            if last_data != '':
                data_list.append(last_data)

            # check the loci number: columns after the 12 fixed ones; it has to
            # be even and constant across records
            if loci_number == -1:
                loci_number = len(data_list) - 12
                if (loci_number % 2) == 1:
                    raise xlib.ProgramException('', 'L011')
            elif loci_number != len(data_list) - 12:
                raise xlib.ProgramException('', 'L012')

            # get left and right genotype lists of loci
            # (alternating columns starting at index 12)
            gt_left_list = data_list[12::2]
            gt_right_list = data_list[13::2]

            # write the record corresponding to the left genotype list
            gt_left_list_text = '\t'.join(gt_left_list)
            structure_file_id.write(f'{data_list[0]}\t{data_list[1]}\t{gt_left_list_text}\n')
            written_record_counter += 1

            # write the record corresponding to the right genotype list
            gt_right_list_text = '\t'.join(gt_right_list)
            structure_file_id.write(f'{data_list[0]}\t{data_list[1]}\t{gt_right_list_text}\n')
            written_record_counter += 1

        # print the counters
        xlib.Message.print('verbose', f'\rProcessed SimHyb records ... {input_record_counter:8d} - Written Structure records ... {written_record_counter:8d}')

        # read the next record of the SimHyb file
        record = simhyb_file_id.readline()

    xlib.Message.print('verbose', '\n')

    # close files
    simhyb_file_id.close()
    structure_file_id.close()

    # print OK message
    xlib.Message.print('info', f'The converted file {os.path.basename(structure_file)} is created.')
def merge_files_operation_1best(annotation_file_1, type_1, annotation_file_2, type_2, merger_file, header):
    '''
    Merge annotation files with operation "1BEST" (all annotations of the first file
    and annotations of the second file if their seq id is not in the first).

    Both input files are assumed to be sorted by "nt_seq_id" — TODO confirm
    against the callers; the merge logic below depends on it.
    '''

    # open the first annotation file (plain or gzip-compressed)
    if annotation_file_1.endswith('.gz'):
        try:
            annotation_file_1_id = gzip.open(annotation_file_1, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', annotation_file_1)
    else:
        try:
            annotation_file_1_id = open(annotation_file_1, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', annotation_file_1)

    # open the second annotation file (plain or gzip-compressed)
    if annotation_file_2.endswith('.gz'):
        try:
            annotation_file_2_id = gzip.open(annotation_file_2, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', annotation_file_2)
    else:
        try:
            annotation_file_2_id = open(annotation_file_2, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', annotation_file_2)

    # open the merged annotation file (plain or gzip-compressed)
    if merger_file.endswith('.gz'):
        try:
            merger_file_id = gzip.open(merger_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException('F004', merger_file)
    else:
        try:
            merger_file_id = open(merger_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException('F003', merger_file)

    # initialize record counters
    read_record_counter_1 = 0
    read_record_counter_2 = 0
    written_record_counter = 0

    # print header record in merged file if necessary
    if header == 'Y':
        xlib.write_annotation_header(merger_file_id, 'MERGER')
        written_record_counter += 1

    # read the first record of the first annotation file
    (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
    xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

    # read the first record of the second annotation file
    (record_2, key_2, data_dict_2) = xlib.read_annotation_record(annotation_file_2, annotation_file_2_id, type_2, read_record_counter_2)
    xlib.Message.print('trace', f'key_2: {key_2} - record_2: {record_2}')

    # while there are records in any annotation file
    # (the first compound of the key, the sequence identification of transcripts nt_seq_id,
    # is the only one considered in this processing)
    while record_1 != '' or record_2 != '':

        # while there are records in the first annotation file and its key is less than
        # the key of the second annotation file (or the second file is exhausted)
        while record_1 != '' and ((record_2 != '' and data_dict_1['nt_seq_id'] < data_dict_2['nt_seq_id']) or record_2 == ''):

            # add 1 to record counter
            read_record_counter_1 += 1

            # write in the merged annotation file
            xlib.write_merged_annotation_record(merger_file_id, type_1, data_dict_1)
            written_record_counter += 1
            xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

            # read the next record of the first annotation file
            (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
            xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

        # while both files have records with the same key: the first file wins
        while record_1 != '' and record_2 != '' and data_dict_1['nt_seq_id'] == data_dict_2['nt_seq_id']:

            # add 1 to record counter
            read_record_counter_1 += 1

            # write in the merged annotation file
            xlib.write_merged_annotation_record(merger_file_id, type_1, data_dict_1)
            written_record_counter += 1
            xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

            # skip the records of the second annotation file with the same key
            while record_2 != '' and data_dict_1['nt_seq_id'] == data_dict_2['nt_seq_id']:
                # FIX: the skipped record of the second file was read but never
                # counted, leaving the summary and the record number passed to
                # read_annotation_record stale
                read_record_counter_2 += 1
                (record_2, key_2, data_dict_2) = xlib.read_annotation_record(annotation_file_2, annotation_file_2_id, type_2, read_record_counter_2)
                xlib.Message.print('trace', f'key_2: {key_2} - record_2: {record_2}')

            # read the next record of the first annotation file
            (record_1, key_1, data_dict_1) = xlib.read_annotation_record(annotation_file_1, annotation_file_1_id, type_1, read_record_counter_1)
            xlib.Message.print('trace', f'key_1: {key_1} - record_1: {record_1}')

        # while there are records in the second annotation file and the key of the first
        # annotation file is greater (or the first file is exhausted)
        while record_2 != '' and ((record_1 != '' and data_dict_1['nt_seq_id'] > data_dict_2['nt_seq_id']) or record_1 == ''):

            # add 1 to record counter
            read_record_counter_2 += 1

            # write in the merged annotation file
            xlib.write_merged_annotation_record(merger_file_id, type_2, data_dict_2)
            written_record_counter += 1
            xlib.Message.print('verbose', f'\rWritten annotations: {written_record_counter}')

            # read the next record of the second annotation file
            (record_2, key_2, data_dict_2) = xlib.read_annotation_record(annotation_file_2, annotation_file_2_id, type_2, read_record_counter_2)
            xlib.Message.print('trace', f'key_2: {key_2} - record_2: {record_2}')

    # print summary
    xlib.Message.print('verbose', '\n')
    xlib.Message.print('info', f'{read_record_counter_1} records read from the first annotation file.')
    xlib.Message.print('info', f'{read_record_counter_2} records read from the second annotation file.')
    xlib.Message.print('info', f'{written_record_counter} records written in the merged annotation file.')

    # close files
    annotation_file_1_id.close()
    annotation_file_2_id.close()
    merger_file_id.close()
def check_args(args):
    '''
    Check the input arguments.
    '''

    # initialize the control variable
    OK = True

    # check "simhyb_file": mandatory and has to exist
    if args.simhyb_file is None:
        xlib.Message.print('error', '*** The SimHyb file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.simhyb_file):
        xlib.Message.print('error', f'*** The file {args.simhyb_file} does not exist.')
        OK = False

    # check "header_row_number": mandatory, has to be an integer >= 0
    if args.header_row_number is None:
        xlib.Message.print('error', '*** The header row number in the SimHyb file is not indicated in the input arguments.')
        OK = False
    elif not xlib.check_int(args.header_row_number, minimum=0):
        # FIX: corrected the typo "equalt" and added the "***" prefix used by
        # every other error message
        xlib.Message.print('error', '*** The header row number in the SimHyb file has to be an integer number greater than or equal to 0.')
        OK = False
    else:
        args.header_row_number = int(args.header_row_number)

    # check "structure_file": mandatory
    if args.structure_file is None:
        xlib.Message.print('error', '*** The converted Structure file is not indicated in the input arguments.')
        OK = False

    # check "verbose": takes a default and, when Y, activates the verbose status
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace": takes a default and, when Y, activates the trace status
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
def check_args(args):
    '''
    Check the input arguments.
    '''

    # assume the arguments are valid until a problem is detected
    is_ok = True

    # "annotation_file_1" is mandatory and has to exist
    if args.annotation_file_1 is None:
        xlib.Message.print('error', '*** The first annotation file is not indicated in the input arguments.')
        is_ok = False
    elif not os.path.isfile(args.annotation_file_1):
        xlib.Message.print('error', f'*** The file {args.annotation_file_1} does not exist.')
        is_ok = False

    # "type_1" is mandatory, has to be an admitted code and is normalized to upper case
    if args.type_1 is None:
        xlib.Message.print('error', '*** The type of first annotation file is not indicated in the input arguments.')
        is_ok = False
    elif not xlib.check_code(args.type_1, xlib.get_type_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** The type of annotation file has to be {xlib.get_type_code_list_text()}.')
        is_ok = False
    else:
        args.type_1 = args.type_1.upper()

    # "annotation_file_2" is mandatory; it can be the literal NONE, otherwise it has to exist
    if args.annotation_file_2 is None:
        xlib.Message.print('error', '*** The second annotation file is not indicated in the input arguments.')
        is_ok = False
    elif args.annotation_file_2.upper() == 'NONE':
        args.annotation_file_2 = args.annotation_file_2.upper()
    elif not os.path.isfile(args.annotation_file_2):
        xlib.Message.print('error', f'*** The file {args.annotation_file_2} does not exist.')
        is_ok = False

    # "type_2" is mandatory; it has to be NONE exactly when the second file is NONE,
    # otherwise an admitted code; normalized to upper case
    if args.type_2 is None:
        xlib.Message.print('error', '*** The format of second annotation file is not indicated in the input arguments.')
        is_ok = False
    elif args.type_2.upper() == 'NONE' and args.annotation_file_2 != 'NONE':
        xlib.Message.print('error', '*** The format of second annotation file has to be NONE if the second annotation file is NONE')
        is_ok = False
    elif args.type_2.upper() == 'NONE' and args.annotation_file_2 == 'NONE':
        args.type_2 = args.type_2.upper()
    elif not xlib.check_code(args.type_2, xlib.get_type_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** The type of annotation file has to be {xlib.get_type_code_list_text()}.')
        is_ok = False
    else:
        args.type_2 = args.type_2.upper()

    # "merger_file" is mandatory
    if args.merger_file is None:
        xlib.Message.print('error', '*** The merged file is not indicated in the input arguments.')
        is_ok = False

    # "merger_operation" is mandatory; SAVE1 requires a NONE second file, any other
    # operation has to be an admitted code; normalized to upper case
    if args.merger_operation is None:
        xlib.Message.print('error', '*** The merger operation is not indicated in the input arguments.')
        is_ok = False
    elif args.merger_operation.upper() == 'SAVE1' and args.annotation_file_2 != 'NONE':
        xlib.Message.print('error', '*** The merger operation SAVE1 is only valid when the second annotation file is NONE.')
        is_ok = False
    elif args.merger_operation.upper() != 'SAVE1' and not xlib.check_code(args.merger_operation, xlib.get_annotation_merger_operation_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** The merger operation has to be {xlib.get_annotation_merger_operation_code_list_text()}.')
        is_ok = False
    else:
        args.merger_operation = args.merger_operation.upper()

    # "header" takes a default, has to be an admitted code and is normalized to upper case
    if args.header is None:
        args.header = xlib.Const.DEFAULT_HEADER
    elif not xlib.check_code(args.header, xlib.get_header_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** header has to be {xlib.get_header_code_list_text()}.')
        is_ok = False
    else:
        args.header = args.header.upper()

    # "verbose" takes a default and, when Y, activates the verbose status
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        is_ok = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # "trace" takes a default and, when Y, activates the trace status
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        is_ok = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # abort with exception when any check failed
    if not is_ok:
        raise xlib.ProgramException('P001')
def build_nexus_file(selection_loci_id_file_path, complete_loci_file_path, selected_loci_file_path, nexus_file_path):
    '''
    Build a Nexus file from an ipyrad ".loci" file for a given set of loci.

    The function works in two passes: first it copies the records of the
    selected loci from the complete loci file into the selected loci file,
    accumulating taxon ids, per-locus sequence lengths and the total base
    count; then it re-reads the selected loci file to write an interleaved
    Nexus data matrix plus a "begin assumptions" block with one charset per
    locus.

    Parameters:
        selection_loci_id_file_path: file with the ids of the selected loci.
        complete_loci_file_path: the complete ipyrad loci file.
        selected_loci_file_path: intermediate file with only the selected loci.
        nexus_file_path: the output Nexus file.

    Raises:
        xlib.ProgramException: with code 'F001' on any file open/read error.
    '''

    # initialize the selected loci id list
    selected_loci_id_list = []

    # load the selected loci ids and set the selected loci id list
    try:
        with open(selection_loci_id_file_path) as selected_loci_ids_file_id:
            for record in selected_loci_ids_file_id:
                # assumes each record carries a fixed 6-character prefix before
                # the locus id — TODO confirm the selection file format
                selected_loci_id_list.append(record[6:].rstrip())
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', selection_loci_id_file_path)
    xlib.Message.print('trace', f'selected_loci_id_list: {selected_loci_id_list}\n')

    # open the complete loci file
    try:
        complete_loci_file_id = open(complete_loci_file_path, mode='r', encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', complete_loci_file_path)

    # open the selected loci file (first pass: written, second pass: read)
    try:
        selected_loci_file_id = open(selected_loci_file_path, mode='w', encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', selected_loci_file_path)

    # set the pattern of the locus id records, i.e. the "//....|locus_id|"
    # separator lines that close each locus in a .loci file
    pattern1 = r'^\/\/(.*)\|(.*)\|$'

    # set the pattern of the locus information records ("taxon_id sequence")
    pattern2 = r'^(.*) (.*)$'

    # initialize the list of locus information records
    locus_line_list = []

    # initialize the sequence locus lenght list (one length per selected locus)
    seq_locus_lenght_list = []

    # initialize the taxon id list
    taxon_id_list = []

    # initialize the base count (total matrix width, i.e. Nexus "nchar")
    base_count = 0

    # read the first record of complete loci file
    record = complete_loci_file_id.readline()

    # while there are records
    while record != '':

        # process the locus id record
        if record.startswith('//'):

            # extract the locus id
            mo = re.search(pattern1, record)
            locus_id = mo.group(2)

            # when the locus id is a selected locus, write locus information in the selected loci file
            if locus_id in selected_loci_id_list:
                for i in range(len(locus_line_list)):

                    # extract the taxon id and sequence
                    mo = re.search(pattern2, locus_line_list[i])
                    taxon_id = mo.group(1).strip()
                    sequence = mo.group(2).strip()

                    # add the taxon id to taxon id list
                    if taxon_id not in taxon_id_list:
                        taxon_id_list.append(taxon_id)

                    # when the first taxon of the locus
                    if i == 0:

                        # add the sequence length to the base count
                        base_count += len(sequence)

                        # record the locus length taken from the first taxon
                        # (all taxa of a locus are assumed to be aligned to the
                        # same length — TODO confirm for the input data)
                        seq_locus_lenght_list.append(len(sequence))

                    # write the line to the selected loci file
                    selected_loci_file_id.write(locus_line_list[i])

                # write the locus id record to the selected loci file
                selected_loci_file_id.write(record)

            # initialize the list of locus information records
            locus_line_list = []

        # process a locus information record
        else:

            # add the record to the list of locus information records
            locus_line_list.append(record)

        # read the next record of complete loci file
        record = complete_loci_file_id.readline()

    # sort the taxon id list
    taxon_id_list.sort()
    xlib.Message.print('trace', f'taxon_id_list: {taxon_id_list}\n')

    # close files
    complete_loci_file_id.close()
    selected_loci_file_id.close()

    # initialize the dictionary of locus information records (taxon -> sequence)
    locus_line_dict = {}

    # open the selected loci file (second pass)
    try:
        selected_loci_file_id = open(selected_loci_file_path, mode='r', encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', selected_loci_file_path)

    # open the Nexus file
    try:
        nexus_file_id = open(nexus_file_path, mode='w', encoding='iso-8859-1')
    except Exception as e:
        raise xlib.ProgramException(e, 'F001', nexus_file_path)

    # write the head records in Nexus file
    nexus_file_id.write('#nexus\n')
    nexus_file_id.write('begin data;\n')
    nexus_file_id.write(f' dimensions ntax={len(taxon_id_list)} nchar={base_count};\n')
    nexus_file_id.write(' format datatype=DNA interleave=yes gap=-;\n')
    nexus_file_id.write(' matrix\n')

    # read the first record of the selected loci file
    record = selected_loci_file_id.readline()

    # while there are records
    while record != '':

        # process the locus id record: flush the accumulated locus block
        if record.startswith('//'):

            # get the sequence length of a taxon of this locus
            sequence_len = len(locus_line_dict[list(locus_line_dict.keys())[0]])

            # for each taxon, write its information
            for taxon_id in taxon_id_list:

                # get the taxon sequence; taxa absent from this locus are
                # padded with 'N' so every matrix row has the same length
                sequence = locus_line_dict.get(taxon_id, 'N' * sequence_len)

                # write the locus information record in the Nexus file
                nexus_file_id.write(f' {taxon_id:30} {sequence}\n')

            # write a blank line in the Nexus file (interleaved block separator)
            nexus_file_id.write('\n')

            # initialize the dictionary of locus information records
            locus_line_dict = {}

        # process a locus information record
        else:

            # extract the taxon id and sequence
            mo = re.search(pattern2, record)
            taxon_id = mo.group(1).strip()
            sequence = mo.group(2).strip()

            # add the record to the dictionary of locus information records
            locus_line_dict[taxon_id] = sequence

        # read the next record of selected loci file
        record = selected_loci_file_id.readline()

    # write the tail records in Nexus file
    nexus_file_id.write(' ;\n')
    nexus_file_id.write('end;\n')
    nexus_file_id.write('begin assumptions;\n')
    start_position = 1
    for i in range(len(seq_locus_lenght_list)):
        # charset positions are 1-based and inclusive: locus i spans
        # [start_position, start_position + length - 1]
        end_position = start_position + seq_locus_lenght_list[i]
        nexus_file_id.write(f' charset locus_{i + 1} = {start_position}-{end_position - 1};\n')
        start_position = end_position
    nexus_file_id.write('end;\n')

    # close files
    selected_loci_file_id.close()
    nexus_file_id.close()
def load_table_species(conn, species_file):
    '''
    Load the table "species" of the TOA database from a species data file.

    The table is dropped and recreated, every data record of the file is
    completed with taxonomy data fetched from the taxonomy server and
    inserted, and the table index is created before committing the changes.

    Parameters:
        conn: the connection to the TOA SQLite database.
        species_file: path of the species data file (optionally gzipped) with
            records formatted as: "species_name";"plaza_id".
    '''

    # drop table "species" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "species" ...\n')
    xsqlite.drop_species(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "species"
    xlib.Message.print('verbose', 'Creating the table "species" ...\n')
    xsqlite.create_species(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # open the file of species data
    # NOTE(review): unlike other functions in this file, these raises do not
    # pass the caught exception "e" to xlib.ProgramException — confirm the
    # expected signature of ProgramException
    if species_file.endswith('.gz'):
        try:
            species_file_id = gzip.open(species_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F002', species_file)
    else:
        try:
            species_file_id = open(species_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException('F001', species_file)

    # set the pattern of the data records
    # format: "species_name";"plaza_id"
    record_pattern = re.compile(r'^"(.*)";"(.*)"$')

    # initialize the record counter
    record_counter = 0

    # initialize the inserted row counter
    inserted_row_counter = 0

    # read the first record
    record = species_file_id.readline()

    # while there are records
    while record != '':

        # add 1 to record counter
        record_counter += 1

        # process data records; comment lines ("#") and blank lines are skipped
        if not record.lstrip().startswith('#') and record.strip() != '':

            # initialize the row data dictionary
            row_dict = {}

            # extract data (a non-matching record makes mo None and the
            # group() call raise, which is reported as a format error)
            try:
                mo = record_pattern.match(record)
                row_dict['species_name'] = mo.group(1).strip().capitalize()
                row_dict['plaza_species_id'] = mo.group(2).strip().lower()
            except Exception as e:
                raise xlib.ProgramException('F006', os.path.basename(species_file), record_counter)

            # get the taxonomy dictionary of the species name from taxonomy server
            taxonomy_dict = xlib.get_taxonomy_dict('name', row_dict['species_name'])

            # when the species is not found, set the taxonomy columns to N/A
            if taxonomy_dict == {}:
                row_dict['family_name'] = xlib.get_na()
                row_dict['phylum_name'] = xlib.get_na()
                row_dict['kingdom_name'] = xlib.get_na()
                row_dict['superkingdom_name'] = xlib.get_na()
                row_dict['tax_id'] = xlib.get_na()
            # otherwise copy the taxonomy data
            # (assumes the returned dictionary always contains the 'family',
            # 'phylum', 'kingdom', 'superkingdom' and 'tax_id' keys — TODO
            # confirm against xlib.get_taxonomy_dict)
            else:
                row_dict['family_name'] = taxonomy_dict['family']['name']
                row_dict['phylum_name'] = taxonomy_dict['phylum']['name']
                row_dict['kingdom_name'] = taxonomy_dict['kingdom']['name']
                row_dict['superkingdom_name'] = taxonomy_dict['superkingdom']['name']
                row_dict['tax_id'] = taxonomy_dict['tax_id']

            # insert data into table species
            xsqlite.insert_species_row(conn, row_dict)
            inserted_row_counter += 1

        # print record counter
        xlib.Message.print('verbose', f'\rProcessed records of species file: {record_counter} - Inserted rows: {inserted_row_counter}')

        # read the next record
        record = species_file_id.readline()

    xlib.Message.print('verbose', '\n')

    # create the index on the table "species"
    xlib.Message.print('verbose', 'Creating the index on the table "species" ...\n')
    xsqlite.create_species_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into TOA database
    xlib.Message.print('verbose', 'Saving changes into TOA database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')

    # close species file
    species_file_id.close()
def check_args(args):
    '''
    Check the input arguments of the allele-frequency/SimHyb program.

    Every invalid argument is reported with an error message; after all checks
    a single exception is raised if any of them failed.  Optional arguments
    are normalized in place (defaults applied, codes upper-cased, the traced
    variant list split into a Python list).

    Parameters:
        args: the argparse.Namespace holding the parsed command-line arguments.

    Raises:
        xlib.ProgramException: with code 'P001' when any argument is invalid.
    '''

    # initialize the control variable; any failed check sets it to False
    OK = True

    # check "vcf_file": mandatory and must be an existing file
    if args.vcf_file is None:
        xlib.Message.print('error', '*** The VCF file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.vcf_file):
        xlib.Message.print('error', f'*** The file {args.vcf_file} does not exist.')
        OK = False

    # check "sample_file": mandatory and must be an existing file
    if args.sample_file is None:
        xlib.Message.print('error', '*** The sample file is not indicated in the input arguments.')
        OK = False
    elif not os.path.isfile(args.sample_file):
        xlib.Message.print('error', f'*** The file {args.sample_file} does not exist.')
        OK = False

    # check "sp1_id": mandatory
    if args.sp1_id is None:
        xlib.Message.print('error', '*** The identification of the first species is not indicated in the input arguments.')
        OK = False

    # check "sp2_id": mandatory
    if args.sp2_id is None:
        xlib.Message.print('error', '*** The identification of the second species is not indicated in the input arguments.')
        OK = False

    # check "hybrid_id": optional; the string 'NONE' means there are no hybrids
    if args.hybrid_id is None:
        args.hybrid_id = 'NONE'

    # check "output_dir": mandatory and must be an existing directory
    # (fixed typo "directy" -> "directory" in both error messages)
    if args.output_dir is None:
        xlib.Message.print('error', '*** The output directory is not indicated in the input arguments.')
        OK = False
    elif not os.path.isdir(args.output_dir):
        xlib.Message.print('error', '*** The output directory does not exist.')
        OK = False

    # check "variant_number_per_file": optional with default; must be an
    # integer greater than 0 and is converted to int when valid
    if args.variant_number_per_file is None:
        args.variant_number_per_file = xlib.Const.DEFAULT_VARIANT_NUMBER_PER_FILE
    elif not xlib.check_int(args.variant_number_per_file, minimum=1):
        xlib.Message.print('error', 'The variant number per file has to be an integer number greater than 0.')
        OK = False
    else:
        args.variant_number_per_file = int(args.variant_number_per_file)

    # check "allele_transformation": optional; must be a supported code
    if args.allele_transformation is None:
        args.allele_transformation = 'NONE'
    elif not xlib.check_code(args.allele_transformation, xlib.get_allele_transformation_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** The allele transformation has to be {xlib.get_allele_transformation_code_list_text()}.')
        OK = False
    else:
        args.allele_transformation = args.allele_transformation.upper()

    # check "verbose": optional with default; 'Y' enables verbose messages
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        OK = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # check "trace": optional with default; 'Y' enables trace messages
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        OK = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # check "tvi_list": optional; 'NONE' or absent means no variants to trace
    if args.tvi_list is None or args.tvi_list == 'NONE':
        args.tvi_list = []
    else:
        args.tvi_list = xlib.split_literal_to_string_list(args.tvi_list)

    # check the identification set: the two species must differ and, when a
    # hybrid id is given, it must differ from both species.
    # (fixed: the original tested "args.hybrid_id is not None", which is always
    # True because hybrid_id is normalized to the string 'NONE' above, so a
    # species literally identified as 'NONE' was wrongly rejected; compare
    # against the 'NONE' sentinel instead)
    if OK:
        if args.sp1_id == args.sp2_id or \
           (args.hybrid_id != 'NONE' and (args.sp1_id == args.hybrid_id or args.sp2_id == args.hybrid_id)):
            xlib.Message.print('error', 'The identifications must be different.')
            OK = False

    # if there are errors, exit with exception
    if not OK:
        raise xlib.ProgramException('', 'P001')
def load_table_kegg_ids(conn, kegg_id_file): ''' ''' # set the pattern of the data records # format: kegg_id\tthreshold\tscore_type\tprofile_type\tF-measure\tnseq\tnseq_used\talen\tmlen\teff_nseq\tre/pos\tdefinition record_pattern = re.compile(r'^(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)$') # drop table "kegg_ids" xlib.Message.print('verbose', 'Droping the table "kegg_ids" ...\n') xsqlite.drop_kegg_ids(conn) xlib.Message.print('verbose', 'The table is droped.\n') # create table "kegg_ids" xlib.Message.print('verbose', 'Creating the table "kegg_ids" ...\n') xsqlite.create_kegg_ids(conn) xlib.Message.print('verbose', 'The table is created.\n') # open the file of KEGG ids if kegg_id_file.endswith('.gz'): try: kegg_id_file_id = gzip.open(kegg_id_file, mode='rt', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException('F002', kegg_id_file) else: try: kegg_id_file_id = open(kegg_id_file, mode='r', encoding='iso-8859-1') except Exception as e: raise xlib.ProgramException('F001', kegg_id_file) # initialize the record counter record_counter = 0 # initialize the inserted row counter inserted_row_counter = 0 # initialize the header record control header_record = True # read the first record record = kegg_id_file_id.readline() # while there are records while record != '': # add 1 to the record counter record_counter += 1 # process the header record if header_record: header_record = False # process data records else: # initialize the row data dictionary row_dict = {} # extract data try: mo = record_pattern.match(record) row_dict['kegg_id'] = mo.group(1).strip().lower() # definition format: description [EC:ec_id] definition = mo.group(12).strip() open_bracket_pos = definition.find('[') if open_bracket_pos > -1: row_dict['desc'] = definition[:open_bracket_pos].strip() row_dict['ec_id'] = definition[open_bracket_pos+4:-1].strip() else: row_dict['desc'] = definition row_dict['ec_id'] = 'N/A' except Exception as e: raise 
xlib.ProgramException('F006', os.path.basename(kegg_id_file), record_counter) # change quotation marks and semicolons in "desc" row_dict['desc'] = row_dict['desc'].replace("'", '|').replace(';', ',') # insert data into table "kegg_ids" xsqlite.insert_kegg_ids_row(conn, row_dict) inserted_row_counter += 1 # print record counter xlib.Message.print('verbose', f'\rProcessed records of KEGG ids file: {record_counter} - Inserted rows: {inserted_row_counter}') # read the next record record = kegg_id_file_id.readline() xlib.Message.print('verbose', '\n') # create the index on the table "kegg_ids" xlib.Message.print('verbose', 'Creating the index on the table "kegg_ids" ...\n') xsqlite.create_kegg_ids_index(conn) xlib.Message.print('verbose', 'The index is created.\n') # save changes into TOA database xlib.Message.print('verbose', 'Saving changes into TOA database ...\n') conn.commit() xlib.Message.print('verbose', 'Changes are saved.\n') # close kegg_ids file kegg_id_file_id.close()
def build_allele_frequency(vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, output_dir, variant_number_per_file, allele_transformation, tvi_list):
    '''
    Calculate the per-species allele frequencies of every variant of a VCF
    file and write them to one or more SimHyb-format CSV files.

    (The previous docstring, "Filter and fixes variant data of a VCF file.",
    described a different program and has been corrected.)

    Parameters:
        vcf_file: path of the input VCF file (optionally gzipped).
        sample_file: path of the sample file with sample/species/mother data.
        sp1_id: identification of the first species.
        sp2_id: identification of the second species.
        hybrid_id: identification of the hybrids or 'NONE'.
        output_dir: directory where the SimHyb files are written.
        variant_number_per_file: maximum number of variants per SimHyb file.
        allele_transformation: 'ADD100', 'ATCG' or 'NONE'.
        tvi_list: list of variant ids to trace.

    Raises:
        xlib.ProgramException: on file errors or invalid VCF data.
    '''

    # initialize the sample number (set for real at the #CHROM record)
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # initialize the sample species and mother identification lists per variant
    species_id_list = []
    mother_id_list = []

    # initialize the maximum allele number per variant
    maximum_allele_number = 0

    # initialize allele frequency dictionaries (variant number -> allele -> frequency)
    allele_frequency_dict_1 = {}
    allele_frequency_dict_2 = {}

    # initialize the ATCG conversion dictionary
    # A -> 1; T -> 2; C -> 3; G -> 4
    atcg = 'ATCG'
    atcg_conversion_dict = {}

    # open the input VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # read the first record of input VCF file
    (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    # while there are records in input VCF file
    while record != '':

        # process metadata records
        while record != '' and record.startswith('##'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # process the column description record
        if record.startswith('#CHROM'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # get the record data list
            record_data_list = data_dict['record_data_list']

            # build the sample species and mother identification lists per variant
            # (sample columns start at index 9 in a VCF header record)
            for i in range(9, len(record_data_list)):
                try:
                    species_id = sample_dict[record_data_list[i]]['species_id']
                    mother_id = sample_dict[record_data_list[i]]['mother_id']
                except Exception as e:
                    raise xlib.ProgramException(e, 'L002', record_data_list[i])
                species_id_list.append(species_id)
                mother_id_list.append(mother_id)

            # check if the sample species list is empty
            # (fixed: the original raised xlib.ProgramException(e, 'L003'), but
            # "e" is unbound here — Python deletes the except-clause name after
            # its block — so an empty list caused a NameError instead of L003)
            if species_id_list == []:
                raise xlib.ProgramException('', 'L003')

            # set the sample number
            sample_number = len(species_id_list)

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # process variant records
        while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

            # set the variant identification
            variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'

            # add 1 to the read sequence counter
            input_record_counter += 1

            # add 1 to the total variant counter
            total_variant_counter += 1

            if variant_id in tvi_list: xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')
            if variant_id in tvi_list: xlib.Message.print('trace', f'total_variant_counter: {total_variant_counter}')

            # get the reference bases (field REF)
            reference_bases = data_dict['ref']

            # build the alternative alleles list from field ALT
            # (removed the unused local "alternative_alleles")
            alternative_allele_list = data_dict['alt'].split(',')

            # build the ATCG conversion list: index 0 corresponds to the
            # reference allele, indexes 1.. to the alternative alleles
            atcg_conversion_list = []
            index = atcg.find(reference_bases.upper())
            if index == -1:
                raise xlib.ProgramException('', 'L016')
            else:
                atcg_conversion_list.append(index + 1)
            for i in range(len(alternative_allele_list)):
                index = atcg.find(alternative_allele_list[i].upper())
                if index == -1:
                    raise xlib.ProgramException('', 'L016')
                else:
                    atcg_conversion_list.append(index + 1)
            atcg_conversion_dict[total_variant_counter] = atcg_conversion_list

            # get the position of the genotype (subfield GT) in the field FORMAT
            format_subfield_list = data_dict['format'].upper().split(':')
            try:
                gt_position = format_subfield_list.index('GT')
            except Exception as e:
                raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

            # build the list of sample genotypes of a variant
            sample_gt_list = []
            for i in range(sample_number):
                sample_data_list = data_dict['sample_list'][i].split(':')
                sample_gt_list.append(sample_data_list[gt_position])

            # build the lists of the left and right side of sample genotypes
            # of a variant; genotypes may be separated by '/' or '|'
            sample_gt_left_list = []
            sample_gt_right_list = []
            for i in range(sample_number):
                sep = '/'
                sep_pos = sample_gt_list[i].find(sep)
                if sep_pos == -1:
                    sep = '|'
                    sep_pos = sample_gt_list[i].find(sep)
                if sep_pos == -1:
                    # fixed: aligned with the other raises of this function,
                    # which pass an exception placeholder before the code
                    raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                sample_gt_left_list.append(sample_gt_list[i][:sep_pos])
                sample_gt_right_list.append(sample_gt_list[i][sep_pos + 1:])

            if variant_id in tvi_list: xlib.Message.print('trace', f'reference_bases: {reference_bases}')
            if variant_id in tvi_list: xlib.Message.print('trace', f'alternative_allele_list: {alternative_allele_list}')
            if variant_id in tvi_list: xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

            # get the allele counters per species; only adult samples (without
            # mother) with non-missing genotypes are counted
            allele_counter_dict_1 = {}
            allele_counter_dict_2 = {}
            allele_counter_dict_h = {}
            for i in range(sample_number):
                # only when the sample is an adult
                if mother_id_list[i] == 'NONE':
                    if sample_gt_left_list[i] != xlib.get_md_symbol():
                        if species_id_list[i] == sp1_id:
                            allele_counter_dict_1[sample_gt_left_list[i]] = allele_counter_dict_1.get(sample_gt_left_list[i], 0) + 1
                        elif species_id_list[i] == sp2_id:
                            allele_counter_dict_2[sample_gt_left_list[i]] = allele_counter_dict_2.get(sample_gt_left_list[i], 0) + 1
                        else:
                            allele_counter_dict_h[sample_gt_left_list[i]] = allele_counter_dict_h.get(sample_gt_left_list[i], 0) + 1
                    if sample_gt_right_list[i] != xlib.get_md_symbol():
                        if species_id_list[i] == sp1_id:
                            allele_counter_dict_1[sample_gt_right_list[i]] = allele_counter_dict_1.get(sample_gt_right_list[i], 0) + 1
                        elif species_id_list[i] == sp2_id:
                            allele_counter_dict_2[sample_gt_right_list[i]] = allele_counter_dict_2.get(sample_gt_right_list[i], 0) + 1
                        else:
                            allele_counter_dict_h[sample_gt_right_list[i]] = allele_counter_dict_h.get(sample_gt_right_list[i], 0) + 1

            if variant_id in tvi_list: xlib.Message.print('trace', f'allele_counter_dict_1: {allele_counter_dict_1}')
            if variant_id in tvi_list: xlib.Message.print('trace', f'allele_counter_dict_2: {allele_counter_dict_2}')
            if variant_id in tvi_list: xlib.Message.print('trace', f'allele_counter_dict_h: {allele_counter_dict_h}')

            # calculate the maximum allele number over the two species
            if maximum_allele_number < len(allele_counter_dict_1.keys()):
                maximum_allele_number = len(allele_counter_dict_1.keys())
            if maximum_allele_number < len(allele_counter_dict_2.keys()):
                maximum_allele_number = len(allele_counter_dict_2.keys())

            # calculate the variant allele frequencies per species
            allele_frequency_dict_1[total_variant_counter] = {}
            sp1_allele_total = 0
            for allele in allele_counter_dict_1.keys():
                sp1_allele_total += allele_counter_dict_1[allele]
            for allele in allele_counter_dict_1.keys():
                allele_frequency_dict_1[total_variant_counter][allele] = allele_counter_dict_1[allele] / sp1_allele_total
                if variant_id in tvi_list: xlib.Message.print('trace', f'allele_frequency_dict_1[{total_variant_counter}][{allele}]: {allele_frequency_dict_1[total_variant_counter][allele]}')
            allele_frequency_dict_2[total_variant_counter] = {}
            sp2_allele_total = 0
            for allele in allele_counter_dict_2.keys():
                sp2_allele_total += allele_counter_dict_2[allele]
            for allele in allele_counter_dict_2.keys():
                allele_frequency_dict_2[total_variant_counter][allele] = allele_counter_dict_2[allele] / sp2_allele_total
                if variant_id in tvi_list: xlib.Message.print('trace', f'allele_frequency_dict_2[{total_variant_counter}][{allele}]: {allele_frequency_dict_2[total_variant_counter][allele]}')

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    xlib.Message.print('verbose', '\n')

    # close the VCF file
    vcf_file_id.close()

    # calculate the output SimHyb file number
    simhyb_file_num = math.ceil(total_variant_counter / variant_number_per_file)

    # initialize the begin and end variant of the first output file
    begin_variant = 1
    end_variant = variant_number_per_file if variant_number_per_file < total_variant_counter else total_variant_counter

    # write the variant allele frequencies per species in the output SimHyb files
    # (renamed the file loop index: the original reused "i" for both this loop
    # and the allele row loop below, shadowing the outer index)
    for file_index in range(simhyb_file_num):
        xlib.Message.print('trace', '\n\n\n\nbegin_variant: {} - end_variant: {}'.format(begin_variant, end_variant))

        # set the SimHyb file name
        if vcf_file.endswith('.gz'):
            file_name, file_extension = os.path.splitext(os.path.basename(vcf_file[:-3]))
        else:
            file_name, file_extension = os.path.splitext(os.path.basename(vcf_file))
        if simhyb_file_num == 1:
            current_simhyb_file = f'{output_dir}/{file_name}-allelefreq.csv'
        else:
            current_simhyb_file = f'{output_dir}/{file_name}-allelefreq-{file_index:03d}.csv'

        # open the output SimHyb file
        if current_simhyb_file.endswith('.gz'):
            try:
                current_simhyb_file_id = gzip.open(current_simhyb_file, mode='wt', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F004', current_simhyb_file)
        else:
            try:
                current_simhyb_file_id = open(current_simhyb_file, mode='w', encoding='iso-8859-1', newline='\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F003', current_simhyb_file)

        # write allele frequency records: row i holds the i-th (sorted) allele
        # of every variant of species 1 followed by every variant of species 2
        for i in range(maximum_allele_number):
            xlib.Message.print('trace', f'i: {i}')

            # initialize the variable to control the record begin
            is_begin = True

            # species 1
            for j in range(begin_variant, end_variant + 1):
                xlib.Message.print('trace', f'j: {j}')

                # get the allele and its frequency
                variant_data_dict = allele_frequency_dict_1.get(j, {})
                xlib.Message.print('trace', f'variant_data_dict: {variant_data_dict}')
                if variant_data_dict == {}:
                    allele = 0
                    allele_frequency = 0
                else:
                    allele_list = sorted(variant_data_dict.keys())
                    if i < len(allele_list):
                        allele = allele_list[i]
                        allele_frequency = variant_data_dict[allele]
                        if allele_transformation == 'ADD100' and xlib.check_int(allele):
                            allele = int(allele) + 100
                        elif allele_transformation == 'ATCG':
                            allele = atcg_conversion_dict[j][int(allele)]
                    else:
                        allele = 0
                        allele_frequency = 0

                # write the part of this record corresponding with the variant
                if is_begin:
                    record_part = f'{allele};{allele_frequency}'
                    is_begin = False
                else:
                    record_part = f';{allele};{allele_frequency}'
                current_simhyb_file_id.write(record_part)

            # species 2
            for j in range(begin_variant, end_variant + 1):

                # get the allele and its frequency
                variant_data_dict = allele_frequency_dict_2.get(j, {})
                if variant_data_dict == {}:
                    allele = 0
                    allele_frequency = 0
                else:
                    allele_list = sorted(variant_data_dict.keys())
                    if i < len(allele_list):
                        allele = allele_list[i]
                        allele_frequency = variant_data_dict[allele]
                        if allele_transformation == 'ADD100' and xlib.check_int(allele):
                            allele = int(allele) + 100
                        elif allele_transformation == 'ATCG':
                            allele = atcg_conversion_dict[j][int(allele)]
                    else:
                        allele = 0
                        allele_frequency = 0

                # write the part of this record corresponding with the variant
                record_part = f';{allele};{allele_frequency}'
                current_simhyb_file_id.write(record_part)

            # write the end of the record
            current_simhyb_file_id.write('\n')

        # close SimHyb file
        current_simhyb_file_id.close()

        # print OK message
        xlib.Message.print('info', f'The SimHyb file {os.path.basename(current_simhyb_file)} is created.')

        # set the new begin and end variant for the next output file
        begin_variant = end_variant + 1
        end_variant = begin_variant + variant_number_per_file - 1 if begin_variant + variant_number_per_file - 1 < total_variant_counter else total_variant_counter
def filter_transcripts(assembly_software_code, transcriptome_file, score_file, output_file, minlen, maxlen, minFPKM, minTPM):
    '''
    Filter transcripts according to their length, FPKM and TPM.

    Transcripts whose length is in [minlen, maxlen] and whose FPKM and TPM are
    greater than or equal to the given minimums are copied from the FASTA
    transcriptome file to the output file.

    Parameters:
        assembly_software_code: code of the assembly software used.
        transcriptome_file: path of the FASTA transcriptome file (optionally gzipped).
        score_file: path of the tab-separated score file; its header must hold
            TRANSCRIPT_ID, LENGTH, FPKM and TPM columns.
        output_file: path of the filtered FASTA output file (optionally gzipped).
        minlen: minimum transcript length.
        maxlen: maximum transcript length.
        minFPKM: minimum FPKM value.
        minTPM: minimum TPM value.

    Raises:
        xlib.ProgramException: on file errors or invalid data.
    '''

    # initialize the transcripts dictionary: transcript id -> length/FPKM/TPM
    transcripts_dict = {}

    # open the score file
    if score_file.endswith('.gz'):
        try:
            score_file_id = gzip.open(score_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', score_file)
    else:
        try:
            score_file_id = open(score_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', score_file)

    # read the header record of the score file and find out the column
    # positions of the transcript id, length, FPKM and TPM
    score_record = score_file_id.readline()
    data_list = score_record.split('\t')
    transcript_id_position = -1
    length_position = -1
    FPKM_position = -1
    TPM_position = -1
    for i, datum in enumerate(data_list):
        if datum.strip().upper().startswith('TRANSCRIPT_ID'):
            transcript_id_position = i
        if datum.strip().upper() == 'LENGTH':
            length_position = i
        elif datum.strip().upper() == 'FPKM':
            FPKM_position = i
        elif datum.strip().upper() == 'TPM':
            TPM_position = i
    if transcript_id_position == -1 or length_position == -1 or FPKM_position == -1 or TPM_position == -1:
        raise xlib.ProgramException('', 'L015')

    # while there are records in score file, save their transcript id, length, FPKM and TPM
    score_record = score_file_id.readline()
    while score_record != '':
        data_list = score_record.split('\t')
        transcript_id = data_list[transcript_id_position].upper()
        try:
            length = float(data_list[length_position])
            (integer_part, decimal_part) = divmod(length, 1)
            if decimal_part > 0:
                # a non-integral length is invalid
                raise xlib.ProgramException('', 'D002', data_list[length_position], 'length')
            else:
                length = int(integer_part)
        except Exception as e:
            raise xlib.ProgramException(e, 'D002', data_list[length_position], 'length')
        try:
            FPKM = float(data_list[FPKM_position])
        except Exception as e:
            raise xlib.ProgramException(e, 'D003', data_list[FPKM_position], 'FPKM')
        try:
            TPM = float(data_list[TPM_position])
        except Exception as e:
            # fixed: the original omitted the error code 'D003' in this raise,
            # so the TPM value was passed where the error code was expected
            raise xlib.ProgramException(e, 'D003', data_list[TPM_position], 'TPM')
        transcripts_dict[transcript_id] = {'length': length, 'FPKM': FPKM, 'TPM': TPM}
        score_record = score_file_id.readline()

    # close score file
    score_file_id.close()

    # open the transcriptome file
    # (renamed the misspelled "tanscriptome_*" locals to "transcriptome_*")
    if transcriptome_file.endswith('.gz'):
        try:
            transcriptome_file_id = gzip.open(transcriptome_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', transcriptome_file)
    else:
        try:
            transcriptome_file_id = open(transcriptome_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', transcriptome_file)

    # open the output file
    if output_file.endswith('.gz'):
        try:
            output_file_id = gzip.open(output_file, mode='wt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', output_file)
    else:
        try:
            output_file_id = open(output_file, mode='w', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', output_file)

    # initialize the counts of processed and saved transcripts
    transcripts_count = 0
    saved_transcripts_count = 0

    # set the pattern of the head records (>transcript_info)
    pattern = r'^>(.*)$'

    # read the first record of transcriptome file
    transcriptome_record = transcriptome_file_id.readline()

    # while there are records in transcriptome file
    while transcriptome_record != '':

        # process the head record
        if transcriptome_record.startswith('>'):

            # extract the data
            mo = re.search(pattern, transcriptome_record)
            transcript_info = mo.group(1)

            # check the origin of the assembly
            if assembly_software_code == xlib.Const.AS_TRINITY_CODE and transcript_info[:7].upper() != 'TRINITY':
                raise xlib.ProgramException('', 'F007', transcriptome_record)

            # get the transcript id (first blank-separated token of the head)
            transcript_id = transcript_info.split(' ')[0].upper()

            # initialize the transcript sequence
            transcript_seq = ''

            # read the next record
            transcriptome_record = transcriptome_file_id.readline()

        else:

            # control the FASTA format: a sequence record without a previous
            # head record is invalid
            raise xlib.ProgramException('', 'F006', transcriptome_file, 'FASTA')

        # while there are records and they are sequence
        while transcriptome_record != '' and not transcriptome_record.startswith('>'):

            # concatenate the record to the transcript sequence
            transcript_seq += transcriptome_record.strip()

            # read the next record of transcriptome file
            transcriptome_record = transcriptome_file_id.readline()

        # add 1 to transcripts count
        transcripts_count += 1

        # write the transcript in the output file if its length is between the
        # minimum and maximum length, and its FPKM and TPM are greater than or
        # equal to the argument values; transcripts absent from the score file
        # get 0 for every score and so are filtered out
        length = transcripts_dict.get(transcript_id, {}).get('length', 0)
        FPKM = transcripts_dict.get(transcript_id, {}).get('FPKM', 0)
        TPM = transcripts_dict.get(transcript_id, {}).get('TPM', 0)
        if length >= minlen and length <= maxlen and FPKM >= minFPKM and TPM >= minTPM:
            try:
                output_file_id.write(f'>{transcript_info}\n')
                output_file_id.write(f'{transcript_seq}\n')
            except Exception as e:
                raise xlib.ProgramException(e, 'F001', output_file)

            # add 1 to saved transcripts count
            saved_transcripts_count += 1

        # print the counters
        xlib.Message.print('verbose', f'\rTranscripts processed ... {transcripts_count:9d} - Transcripts saved ... {saved_transcripts_count:9d}')

    xlib.Message.print('verbose', '\n')

    # close transcriptome and output files
    transcriptome_file_id.close()
    output_file_id.close()

    # print OK message
    print(f'\nThe file {os.path.basename(output_file)} containing the transcripts selected is created.')
def collapse_indels(input_vcf_file, sample_file, imputed_md_id, sp1_id, sp2_id, hybrid_id, output_vcf_file, stats_file, tvi_list):
    '''
    Collapse runs of consecutive variant records that form an indel in a VCF file
    into a single variant record, writing the collapsed VCF and a per-indel
    statistics file.

    Parameters:
        input_vcf_file (str): path of the input VCF file (a ".gz" path is opened with gzip).
        sample_file (str): path of the sample data file forwarded to xlib.get_sample_data().
        imputed_md_id (str): allele code that marks imputed missing data in genotypes.
        sp1_id (str): identification of the first species (forwarded to xlib.get_sample_data()).
        sp2_id (str): identification of the second species (forwarded to xlib.get_sample_data()).
        hybrid_id (str): identification of hybrid individuals (forwarded to xlib.get_sample_data()).
        output_vcf_file (str): path of the output VCF file (a ".gz" path is written with gzip).
        stats_file (str): path of the output ";"-separated statistics file.
        tvi_list (list): "seqid-position" variant identifications that trigger extra trace output.

    Raises:
        xlib.ProgramException: when a file cannot be opened (F001-F004), a sample is not
            found in the sample data (L002), the sample list is empty (L003), the FORMAT
            field lacks a GT subfield (L007) or a genotype has no "/" or "|" separator (L008).
    '''

    # initialize the sample number; it is set for real when the #CHROM record is processed
    sample_number = 0

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # initialize the sample, species and mother identification lists per variant
    sample_id_list = []
    species_id_list = []
    mother_id_list = []

    # open the input VCF file
    if input_vcf_file.endswith('.gz'):
        try:
            input_vcf_file_id = gzip.open(input_vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', input_vcf_file)
    else:
        try:
            input_vcf_file_id = open(input_vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', input_vcf_file)

    # open the output (collapsed) VCF file
    if output_vcf_file.endswith('.gz'):
        try:
            output_vcf_file_id = gzip.open(output_vcf_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', output_vcf_file)
    else:
        try:
            output_vcf_file_id = open(output_vcf_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', output_vcf_file)

    # open the statistics file
    if stats_file.endswith('.gz'):
        try:
            stats_file_id = gzip.open(stats_file, mode='wt', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F004', stats_file)
    else:
        try:
            stats_file_id = open(stats_file, mode='w', encoding='iso-8859-1', newline='\n')
        except Exception as e:
            raise xlib.ProgramException(e, 'F003', stats_file)

    # write the statistics header
    stats_file_id.write('"seq_id";"position";"records";"length";"imputed"\n')

    # initialize counters
    # NOTE(review): collapsed_variant_counter and created_indel_counter are displayed in the
    # progress line but never incremented anywhere in this function — confirm whether they
    # should be updated when a collapse happens.
    input_record_counter = 0
    total_variant_counter = 0
    collapsed_variant_counter = 0
    created_indel_counter = 0

    # read the first record of input VCF file
    (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

    # while there are records in input VCF file
    while record != '':

        # process metadata records: copied verbatim to the output VCF
        while record != '' and record.startswith('##'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # write the metadata record
            output_vcf_file_id.write(record)

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Collapsed variants ... {collapsed_variant_counter:8d} - Created indels ... {created_indel_counter}')

            # read the next record of the input VCF file
            (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # process the column description record (#CHROM): learn the sample layout
        if record.startswith('#CHROM'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # get the record data list
            record_data_list = data_dict['record_data_list']

            # build the sample, species and mother identification lists per variant
            # (sample columns start at index 9 in a VCF column description record)
            for i in range(9, len(record_data_list)):
                try:
                    sample_id = sample_dict[record_data_list[i]]['sample_id']
                    species_id = sample_dict[record_data_list[i]]['species_id']
                    mother_id = sample_dict[record_data_list[i]]['mother_id']
                except Exception as e:
                    raise xlib.ProgramException(e, 'L002', record_data_list[i])
                sample_id_list.append(sample_id)
                species_id_list.append(species_id)
                mother_id_list.append(mother_id)

            # check if the sample species list is empty
            if species_id_list == []:
                raise xlib.ProgramException('', 'L003')

            # set the sample number
            sample_number = len(species_id_list)

            # write the column description record
            output_vcf_file_id.write(record)

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Collapsed variants ... {collapsed_variant_counter:8d} - Created indels ... {created_indel_counter}')

            # read the next record of the input VCF file
            (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)

        # process variant records: each outer iteration emits one (possibly collapsed) variant
        while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

            xlib.Message.print('trace', f'Iniciando...')

            # set the sequence identification and position control variables for this group
            w_seq_id = data_dict['chrom']
            w_position = int(data_dict['pos'])

            # initialize the record counter of the "actual" variant
            actual_variant_record_counter = 0

            # initialize the collapsed reference bases (field REF), built by concatenation
            reference_bases = ''

            # initialize the found best sample list control variable
            found_best_sample_list = False

            # initialize the collapse control variable; cleared as soon as a record with no
            # imputed adults is seen, which ends the group
            collapse = True

            # consume consecutive records of the same sequence whose positions are contiguous
            # (pos == start + records consumed so far) while collapsing is still allowed
            while record != '' and not record.startswith('##') and not record.startswith('#CHROM') and data_dict['chrom'] == w_seq_id and int(data_dict['pos']) == w_position + actual_variant_record_counter and collapse:

                xlib.Message.print('trace', f'Inside the loop')
                xlib.Message.print('trace', f'data_dict["chrom"]: {data_dict["chrom"]} - w_seq_id: {w_seq_id} - position: {data_dict["pos"]} - w_position: {w_position} - actual_variant_record_counter: {actual_variant_record_counter}')

                # set the variant identification
                variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'
                if variant_id in tvi_list:
                    xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')

                # add 1 to the read sequence counter
                input_record_counter += 1

                # add 1 to the total variant counter
                total_variant_counter += 1

                # add 1 to the record counter of the "actual" variant
                actual_variant_record_counter += 1

                # get the position of the genotype (subfield GT) in the field FORMAT
                format_subfield_list = data_dict['format'].upper().split(':')
                try:
                    gt_position = format_subfield_list.index('GT')
                except Exception as e:
                    raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

                # build the list of sample genotypes of a variant
                sample_gt_list = []
                for i in range(sample_number):
                    sample_data_list = data_dict['sample_list'][i].split(':')
                    sample_gt_list.append(sample_data_list[gt_position])

                # build the lists of the left and right side of sample genotypes of a variant
                # (genotypes are split on "/" if present, otherwise on "|")
                sample_gt_left_list = []
                sample_sep_list = []
                sample_gt_right_list = []
                for i in range(sample_number):
                    sep = '/'
                    sep_pos = sample_gt_list[i].find(sep)
                    if sep_pos == -1:
                        sep = '|'
                        sep_pos = sample_gt_list[i].find(sep)
                    if sep_pos == -1:
                        raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                    sample_sep_list.append(sep)
                    sample_gt_left_list.append(sample_gt_list[i][:sep_pos])
                    sample_gt_right_list.append(sample_gt_list[i][sep_pos + 1:])

                if variant_id in tvi_list:
                    xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

                # count adult samples (mother id 'NONE') carrying imputed missing data in
                # either genotype side
                imputed_adult_count = 0
                for i in range(sample_number):
                    # only when the sample is adult
                    if mother_id_list[i] == 'NONE':
                        # check if there are imputed data
                        if sample_gt_left_list[i] == imputed_md_id or sample_gt_right_list[i] == imputed_md_id:
                            imputed_adult_count += 1
                xlib.Message.print('trace', f'variant_id: {variant_id} - imputed_adult_count: {imputed_adult_count}')

                # concat the current reference bases to the collapsed reference bases
                reference_bases = f'{reference_bases}{data_dict["ref"]}'

                # if there are no imputed adults: this record supplies the output fields and
                # the group ends (collapse is switched off)
                # NOTE(review): id/filter/format shadow Python builtins; kept unchanged here.
                if imputed_adult_count == 0:
                    id = data_dict['id']
                    alternative_alleles = data_dict['alt']
                    qual = data_dict['qual']
                    filter = data_dict['filter']
                    info = data_dict['info']
                    format = data_dict['format']
                    best_sample_list = data_dict['sample_list']
                    collapse = False
                # if there are imputed adults: the first record seeds the output fields; a
                # later record with a missing-data ALT may replace them once
                else:
                    if actual_variant_record_counter == 1:
                        id = data_dict['id']
                        alternative_alleles = data_dict['alt']
                        qual = data_dict['qual']
                        filter = data_dict['filter']
                        info = data_dict['info']
                        format = data_dict['format']
                        best_sample_list = data_dict['sample_list']
                        if alternative_alleles == xlib.get_md_symbol():
                            found_best_sample_list = True
                    elif not found_best_sample_list and data_dict['alt'] == xlib.get_md_symbol():
                        id = data_dict['id']
                        alternative_alleles = xlib.get_md_symbol()
                        qual = data_dict['qual']
                        filter = data_dict['filter']
                        info = data_dict['info']
                        format = data_dict['format']
                        best_sample_list = data_dict['sample_list']
                        found_best_sample_list = True

                # read the next record of the input VCF file
                xlib.Message.print('trace', f'Reading ...')
                (record, _, data_dict) = xlib.read_vcf_file(input_vcf_file_id, sample_number)
                if record != '':
                    xlib.Message.print('trace', f'data_dict["chrom"]: {data_dict["chrom"]} - w_seq_id: {w_seq_id} - position: {data_dict["pos"]} - w_position: {w_position} - actual_variant_record_counter: {actual_variant_record_counter}')

            # write the collapsed variant record
            xlib.Message.print('trace', f'Writing VCF ...')
            xlib.Message.print('trace', f'w_seq_id: {w_seq_id} - w_position: {w_position} - actual_variant_record_counter: {actual_variant_record_counter}')
            sample_list_text = '\t'.join(best_sample_list)
            output_vcf_file_id.write(f'{w_seq_id}\t{w_position}\t{id}\t{reference_bases}\t{alternative_alleles}\t{qual}\t{filter}\t{info}\t{format}\t{sample_list_text}\n')

            # write the collapsing statistics record
            xlib.Message.print('trace', f'Writing stats...')
            is_imputed = 'IMPUTED' if imputed_adult_count > 0 else '-'
            stats_file_id.write(f'{w_seq_id};{w_position};{actual_variant_record_counter};{len(reference_bases)};{is_imputed}\n')

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - Collapsed variants ... {collapsed_variant_counter:8d} - Created indels ... {created_indel_counter}')

    xlib.Message.print('verbose', '\n')

    # close files
    input_vcf_file_id.close()
    output_vcf_file_id.close()
    stats_file_id.close()

    # print OK message
    xlib.Message.print('info', f'The file {os.path.basename(output_vcf_file)} is created.')
def check_args(args):
    '''
    Validate the command-line arguments, printing an error message for every
    problem found, and raise a program exception when any check fails.
    '''

    # track whether every argument passed its checks
    is_valid = True

    # validate the loci file path: it must be supplied, exist and end in ".loci"
    if args.loci_file_path is None:
        xlib.Message.print('error', '*** A loci file path is not indicated in the input arguments.')
        is_valid = False
    else:
        if not os.path.isfile(args.loci_file_path):
            xlib.Message.print('error', f'*** The file {args.loci_file_path} does not exist.')
            is_valid = False
        if not args.loci_file_path.endswith('.loci'):
            xlib.Message.print('error', f'*** The file {args.loci_file_path} does not end in ".loci".')
            is_valid = False

    # validate the statistics file path: it must be supplied and end in ".csv"
    if args.stats_file_path is None:
        xlib.Message.print('error', '*** A statistics path is not indicated in the input arguments.')
        is_valid = False
    elif not args.stats_file_path.endswith('.csv'):
        xlib.Message.print('error', f'*** The file {args.stats_file_path} does not end in ".csv".')
        is_valid = False

    # validate "verbose": apply the default when absent, otherwise check the code;
    # then enable verbose output when requested
    if args.verbose is None:
        args.verbose = xlib.Const.DEFAULT_VERBOSE
    elif not xlib.check_code(args.verbose, xlib.get_verbose_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** verbose has to be {xlib.get_verbose_code_list_text()}.')
        is_valid = False
    if args.verbose.upper() == 'Y':
        xlib.Message.set_verbose_status(True)

    # validate "trace": apply the default when absent, otherwise check the code;
    # then enable trace output when requested
    if args.trace is None:
        args.trace = xlib.Const.DEFAULT_TRACE
    elif not xlib.check_code(args.trace, xlib.get_trace_code_list(), case_sensitive=False):
        xlib.Message.print('error', f'*** trace has to be {xlib.get_trace_code_list_text()}.')
        is_valid = False
    if args.trace.upper() == 'Y':
        xlib.Message.set_trace_status(True)

    # abort with a program exception when any check failed
    if not is_valid:
        raise xlib.ProgramException('', 'P001')
def load_vcf_data(conn, vcf_file, sample_file, sp1_id, sp2_id, hybrid_id, imputed_md_id, new_md_id, allele_transformation, tvi_list):
    '''
    Load the data of a VCF file into the NGShelper SQLite database: (re)build the
    tables "vcf_samples", "vcf_variants", "vcf_alleles" and "vcf_samples_alleles"
    and populate them from the VCF records.

    Parameters:
        conn: open SQLite connection to the NGShelper database.
        vcf_file (str): path of the input VCF file (a ".gz" path is opened with gzip).
        sample_file (str): path of the sample data file forwarded to xlib.get_sample_data().
        sp1_id (str): identification of the first species.
        sp2_id (str): identification of the second species.
        hybrid_id (str): identification of hybrid individuals.
        imputed_md_id (str): allele code that marks imputed missing data.
        new_md_id (str): allele code used for (non-imputed) missing data rows.
        allele_transformation (str): 'ADD100' adds 100 to integer allele ids for the
            "structure_allele_id" column; any other value keeps the id unchanged.
        tvi_list (list): "seqid-position" variant identifications that trigger extra trace output.

    Raises:
        xlib.ProgramException: when the VCF file cannot be opened (F001/F002), a sample
            is not found in the sample data (L002), the sample list is empty (L003),
            FORMAT lacks a GT subfield (L007) or a genotype has no separator (L008).
    '''

    # get the sample data
    sample_dict = xlib.get_sample_data(sample_file, sp1_id, sp2_id, hybrid_id)

    # drop table "vcf_samples" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_samples" ...\n')
    xsqlite.drop_vcf_samples(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_samples"
    xlib.Message.print('verbose', 'Creating the table "vcf_samples" ...\n')
    xsqlite.create_vcf_samples(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # insert samples data into table "vcf_samples"; the type is unknown at this point
    xlib.Message.print('verbose', 'Inserting sample data into the table "vcf_samples" ...\n')
    for key, value in sample_dict.items():
        value['type'] = 'N/A'
        xsqlite.insert_vcf_samples_row(conn, value)
    xlib.Message.print('verbose', 'Data are inserted.\n')

    # create index "vcf_samples_index" (if it does not exist)
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_samples" (if it does not exist) ...\n')
    xsqlite.create_vcf_samples_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # get the sample type dictionary
    sample_type_dict = xsqlite.get_sample_type_dict(conn)

    # update the type of each sample
    for key in sample_type_dict.keys():
        xsqlite.update_vcf_samples_row(conn, sample_type_dict[key]['sample_id'], sample_type_dict[key]['type'])

    # drop table "vcf_variants" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_variants" ...\n')
    xsqlite.drop_vcf_variants(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_variants"
    xlib.Message.print('verbose', 'Creating the table "vcf_variants" ...\n')
    xsqlite.create_vcf_variants(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # drop table "vcf_alleles" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_alleles" ...\n')
    xsqlite.drop_vcf_alleles(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_alleles"
    xlib.Message.print('verbose', 'Creating the table "vcf_alleles" ...\n')
    xsqlite.create_vcf_alleles(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # drop table "vcf_samples_alleles" (if it exists)
    xlib.Message.print('verbose', 'Droping the table "vcf_samples_alleles" ...\n')
    xsqlite.drop_vcf_samples_alleles(conn)
    xlib.Message.print('verbose', 'The table is droped.\n')

    # create table "vcf_samples_alleles"
    xlib.Message.print('verbose', 'Creating the table "vcf_samples_alleles" ...\n')
    xsqlite.create_vcf_samples_alleles(conn)
    xlib.Message.print('verbose', 'The table is created.\n')

    # initialize the row data dictionaries reused for every insert into the tables
    # "vcf_variants", "vcf_alleles" and "vcf_samples_alleles"
    vcf_variants_row_dict = {}
    vcf_alleles_row_dict = {}
    vcf_samples_alleles_row_dict = {}

    # build the list of imputed and missing data alleles
    M_I_list = [imputed_md_id, xlib.get_md_symbol()]

    # initialize the sample number; it is set for real when the #CHROM record is processed
    sample_number = 0

    # initialize counters
    input_record_counter = 0
    total_variant_counter = 0
    vcf_variants_inserted_row_counter = 0
    vcf_alleles_inserted_row_counter = 0
    vcf_samples_alleles_inserted_row_counter = 0

    # initialize the sample, species and mother identification lists per variant
    sample_id_list = []
    species_id_list = []
    mother_id_list = []

    # open the input VCF file
    if vcf_file.endswith('.gz'):
        try:
            vcf_file_id = gzip.open(vcf_file, mode='rt', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F002', vcf_file)
    else:
        try:
            vcf_file_id = open(vcf_file, mode='r', encoding='iso-8859-1')
        except Exception as e:
            raise xlib.ProgramException(e, 'F001', vcf_file)

    # read the first record of input VCF file
    (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    # while there are records in input VCF file
    while record != '':

        # process metadata records
        while record != '' and record.startswith('##'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # process the column description record (#CHROM): learn the sample layout
        if record.startswith('#CHROM'):

            # add 1 to the read sequence counter
            input_record_counter += 1

            # get the record data list
            record_data_list = data_dict['record_data_list']

            # build the sample, species and mother identification lists per variant
            # (sample columns start at index 9 in a VCF column description record)
            for i in range(9, len(record_data_list)):
                try:
                    sample_id = record_data_list[i]
                    species_id = sample_dict[record_data_list[i]]['species_id']
                    mother_id = sample_dict[record_data_list[i]]['mother_id']
                except Exception as e:
                    raise xlib.ProgramException(e, 'L002', record_data_list[i])
                sample_id_list.append(sample_id)
                species_id_list.append(species_id)
                mother_id_list.append(mother_id)

            # check if the sample species list is empty
            if species_id_list == []:
                raise xlib.ProgramException('', 'L003')

            # set the sample number
            sample_number = len(species_id_list)

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d}')

            # read the next record of the input VCF file
            (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

        # process variant records
        while record != '' and not record.startswith('##') and not record.startswith('#CHROM'):

            # set the variant identification
            variant_id = f'{data_dict["chrom"]}-{data_dict["pos"]}'

            # add 1 to the read sequence counter
            input_record_counter += 1

            # add 1 to the total variant counter
            total_variant_counter += 1

            if variant_id in tvi_list:
                xlib.Message.print('trace', f'\n\n\n\nseq_id: {data_dict["chrom"]} - position {data_dict["pos"]}')
            if variant_id in tvi_list:
                xlib.Message.print('trace', f'total_variant_counter: {total_variant_counter}')

            # get the reference bases (field REF) and alternative alleles (field ALT)
            reference_bases = data_dict['ref']
            alternative_alleles = data_dict['alt']

            # build the alternative alleles list from field ALT
            alternative_allele_list = data_dict['alt'].split(',')

            # build the alleles list from reference bases and alternative alleles list;
            # when ALT is the missing-data symbol, only the reference allele exists
            if alternative_alleles == xlib.get_md_symbol():
                alleles_list = [reference_bases]
            else:
                alleles_list = [reference_bases] + alternative_allele_list

            # classify the variant: N/A (missing ALT), INDEL (any allele longer than one
            # base, covering both SAMtools/BCFtools and Freebayes conventions),
            # MULTIALLELIC (several single-base ALTs) or SNP
            variant_type = ''
            if alternative_alleles == xlib.get_md_symbol():
                variant_type = 'N/A'
            else:
                is_indel = False
                if len(reference_bases) > 1:
                    is_indel = True
                else:
                    for alternative_allele in alternative_allele_list:
                        if len(alternative_allele) > 1:
                            is_indel = True
                            break
                if is_indel:
                    variant_type = 'INDEL'
                elif len(alternative_allele_list) > 1:
                    variant_type = 'MULTIALLELIC'
                else:
                    variant_type = 'SNP'

            # get the position of the genotype (subfield GT) in the field FORMAT
            format_subfield_list = data_dict['format'].upper().split(':')
            try:
                gt_position = format_subfield_list.index('GT')
            except Exception as e:
                raise xlib.ProgramException(e, 'L007', 'GT', data_dict['chrom'], data_dict['pos'])

            # build the list of sample genotypes of a variant
            sample_gt_list = []
            for i in range(sample_number):
                sample_data_list = data_dict['sample_list'][i].split(':')
                sample_gt_list.append(sample_data_list[gt_position])

            # build the lists of the left and right side of sample genotypes of a variant
            # (genotypes are split on "/" if present, otherwise on "|")
            sample_gt_left_list = []
            sample_gt_right_list = []
            for i in range(sample_number):
                sep = '/'
                sep_pos = sample_gt_list[i].find(sep)
                if sep_pos == -1:
                    sep = '|'
                    sep_pos = sample_gt_list[i].find(sep)
                if sep_pos == -1:
                    raise xlib.ProgramException('', 'L008', 'GT', data_dict['chrom'], data_dict['pos'])
                sample_gt_left_list.append(sample_gt_list[i][:sep_pos])
                sample_gt_right_list.append(sample_gt_list[i][sep_pos+1:])

            if variant_id in tvi_list:
                xlib.Message.print('trace', f'reference_bases: {reference_bases}')
            if variant_id in tvi_list:
                xlib.Message.print('trace', f'alternative_allele_list: {alternative_allele_list}')
            if variant_id in tvi_list:
                xlib.Message.print('trace', f'sample_gt_list: {sample_gt_list}')

            # set data and insert row into the table "vcf_variants"
            vcf_variants_row_dict['variant_id'] = variant_id
            vcf_variants_row_dict['seq_id'] = data_dict['chrom']
            vcf_variants_row_dict['position'] = data_dict['pos']
            vcf_variants_row_dict['reference_bases'] = reference_bases
            vcf_variants_row_dict['alternative_alleles'] = alternative_alleles
            vcf_variants_row_dict['variant_type'] = variant_type
            xsqlite.insert_vcf_variants_row(conn, vcf_variants_row_dict)
            vcf_variants_inserted_row_counter += 1

            # set data and insert rows into the table "vcf_alleles"
            vcf_alleles_row_dict['variant_id'] = variant_id

            # reference bases and alternative alleles
            # NOTE(review): j is already an int, so xlib.check_int(j) presumably always
            # holds and the else branch (structure_allele_id = j, an int) looks unreachable
            # — confirm against xlib.check_int.
            for j in range(len(alleles_list)):
                vcf_alleles_row_dict['allele_id'] = str(j)
                vcf_alleles_row_dict['bases'] = alleles_list[j]
                if xlib.check_int(j) and allele_transformation == 'ADD100':
                    structure_allele_id = str(int(j) + 100)
                else:
                    structure_allele_id = j
                vcf_alleles_row_dict['structure_allele_id'] = structure_allele_id
                xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
                vcf_alleles_inserted_row_counter += 1

            # missing data allele row
            vcf_alleles_row_dict['allele_id'] = xlib.get_md_symbol()
            vcf_alleles_row_dict['bases'] = 'N/D'
            if xlib.check_int(new_md_id) and allele_transformation == 'ADD100':
                structure_allele_id = str(int(new_md_id) + 100)
            else:
                structure_allele_id = new_md_id
            vcf_alleles_row_dict['structure_allele_id'] = structure_allele_id
            xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
            vcf_alleles_inserted_row_counter += 1

            # imputed missing data allele row
            vcf_alleles_row_dict['allele_id'] = imputed_md_id
            vcf_alleles_row_dict['bases'] = 'N/D'
            if xlib.check_int(imputed_md_id) and allele_transformation == 'ADD100':
                structure_allele_id = str(int(imputed_md_id) + 100)
            else:
                structure_allele_id = imputed_md_id
            vcf_alleles_row_dict['structure_allele_id'] = structure_allele_id
            xsqlite.insert_vcf_alleles_row(conn, vcf_alleles_row_dict)
            vcf_alleles_inserted_row_counter += 1

            # set data and insert rows into the table "vcf_samples_alleles"
            vcf_samples_alleles_row_dict['variant_id'] = variant_id
            for i in range(sample_number):

                vcf_samples_alleles_row_dict['sample_id'] = sample_id_list[i]

                # initialize the genotype distribution dictionary: one slot per allele plus
                # the imputed/missing-data codes
                genotype_distribution_dict = {}
                for j in range(len(alleles_list)):
                    genotype_distribution_dict[alleles_list[j]] = 0
                for j in range(len(M_I_list)):
                    genotype_distribution_dict[M_I_list[j]] = 0

                # tally both genotype sides: missing/imputed codes are counted directly,
                # numeric genotype indexes are mapped through alleles_list
                if sample_gt_left_list[i] in M_I_list:
                    genotype_distribution_dict[sample_gt_left_list[i]] += 1
                else:
                    genotype_distribution_dict[alleles_list[int(sample_gt_left_list[i])]] += 1
                if sample_gt_right_list[i] in M_I_list:
                    genotype_distribution_dict[sample_gt_right_list[i]] += 1
                else:
                    genotype_distribution_dict[alleles_list[int(sample_gt_right_list[i])]] += 1

                # calculate frequency (count / 2 for a diploid genotype) and insert rows
                # for reference bases and alternative alleles
                # NOTE(review): allele_id is stored as the int index j here, while the
                # "vcf_alleles" rows above store str(j) — confirm the intended key type.
                for j in range(len(alleles_list)):
                    if genotype_distribution_dict[alleles_list[j]] > 0:
                        # -- vcf_samples_alleles_row_dict['allele_id'] = alleles_list[j]
                        vcf_samples_alleles_row_dict['allele_id'] = j
                        vcf_samples_alleles_row_dict['frecuency'] = genotype_distribution_dict[alleles_list[j]] / 2
                        xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict)
                        vcf_samples_alleles_inserted_row_counter += 1

                # calculate frequency and insert rows for imputed missing data
                if genotype_distribution_dict[imputed_md_id] > 0:
                    vcf_samples_alleles_row_dict['allele_id'] = imputed_md_id
                    vcf_samples_alleles_row_dict['frecuency'] = genotype_distribution_dict[imputed_md_id] / 2
                    xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict)
                    vcf_samples_alleles_inserted_row_counter += 1

                # calculate frequency and insert rows for missing data
                if genotype_distribution_dict[xlib.get_md_symbol()] > 0:
                    vcf_samples_alleles_row_dict['allele_id'] = xlib.get_md_symbol()
                    vcf_samples_alleles_row_dict['frecuency'] = genotype_distribution_dict[xlib.get_md_symbol()] / 2
                    xsqlite.insert_vcf_samples_alleles_row(conn, vcf_samples_alleles_row_dict)
                    vcf_samples_alleles_inserted_row_counter += 1

            # print the counters
            xlib.Message.print('verbose', f'\rProcessed records ... {input_record_counter:8d} - Total variants ... {total_variant_counter:8d} - vcf_variants ... {vcf_variants_inserted_row_counter:8d} - vcf_alleles ... {vcf_alleles_inserted_row_counter:8d} - vcf_samples_alleles ... {vcf_samples_alleles_inserted_row_counter:8d}')

            # read the next record of the input VCF file
            (record, key, data_dict) = xlib.read_vcf_file(vcf_file_id, sample_number)

    xlib.Message.print('verbose', '\n')

    # create the index "vcf_variants_index" on the table "vcf_variants"
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_variants" ...\n')
    xsqlite.create_vcf_variants_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # create the index "vcf_alleles_index" on the table "vcf_alleles"
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_alleles" ...\n')
    xsqlite.create_vcf_alleles_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # create the index "vcf_samples_alleles_index" on the table "vcf_samples_alleles"
    xlib.Message.print('verbose', 'Creating the index on the table "vcf_samples_alleles" ...\n')
    xsqlite.create_vcf_samples_alleles_index(conn)
    xlib.Message.print('verbose', 'The index is created.\n')

    # save changes into NGShelper database
    xlib.Message.print('verbose', 'Saving changes into NGShelper database ...\n')
    conn.commit()
    xlib.Message.print('verbose', 'Changes are saved.\n')

    # close the VCF file
    vcf_file_id.close()