def update_taxa(species_threshold=0.97, genus_threshold=0.80):
    """Rename each sample's trimmed reads and assembly to include its taxon.

    For every ANI result CSV found in the current run's ``ANI`` directory,
    the sample's R1/R2 trimmed-read files and its assembly are located by
    sample name, a taxon label is chosen from the ANI values, and the three
    files are renamed so the label is part of the filename.

    Taxon assignment:
        * best hit with ``ani >= species_threshold`` -> that hit's ``taxon``
        * otherwise any hit with ``ani >= genus_threshold`` -> ``'NTM'``
        * otherwise ``'UNKNOWN'``

    Args:
        species_threshold: Minimum ANI for a species-level assignment.
        genus_threshold: Minimum ANI for the fallback ``'NTM'`` label.

    Failures for a single ANI file (missing reads/assembly, empty CSV,
    rename errors) are logged as warnings and do not stop the loop.
    """
    logger = generic_logger('update_taxa.csv')
    run = Dir().dirname
    trim_dir = Dir(BASEDIR.join('data', 'trimmed_reads', run))
    assembly_dir = Dir(BASEDIR.join('data', 'assemblies', run))
    ani_dir = Dir(assembly_dir.join('ANI'))

    trimmed_reads = trim_dir.files(endswith="fq", dataframe=True)
    assemblies = assembly_dir.files(endswith='fna', dataframe=True)
    ani = ani_dir.files(endswith='.csv')

    for file in ani:
        try:
            # Highest ANI first, so iloc[0] of any filtered view is the
            # best-scoring hit rather than the weakest one.
            df = pd.read_csv(file.path).sort_values('ani', ascending=False)

            # Index with ['sample']: attribute access (`.sample`) resolves to
            # the pandas Series.sample method, not the column value.
            sample_name = str(df.iloc[0]['sample'])

            # Literal (non-regex) substring matching on the filenames.
            # NOTE(review): substring matching means sample "S1" also matches
            # "S10" -- confirm sample names are not prefixes of one another.
            is_sample_read = trimmed_reads.filename.str.contains(
                sample_name, regex=False)
            is_r1 = trimmed_reads.filename.str.contains('_R1', regex=False)
            is_r2 = trimmed_reads.filename.str.contains('_R2', regex=False)
            is_sample_assembly = assemblies.filename.str.contains(
                sample_name, regex=False)

            # .iloc[0] raises IndexError when nothing matches; that is
            # handled by the per-file except below.
            trim1 = File(trimmed_reads[is_sample_read & is_r1].iloc[0].path)
            trim2 = File(trimmed_reads[is_sample_read & is_r2].iloc[0].path)
            assembly = File(assemblies[is_sample_assembly].iloc[0].path)

            # ASSIGN TAXON
            taxon = 'UNKNOWN'
            possible_species = df[df.ani >= species_threshold]
            if len(possible_species) > 0:
                taxon = possible_species.iloc[0].taxon
            elif (df.ani >= genus_threshold).any():
                taxon = 'NTM'

            # Remember the old names for the log lines below.
            trim1_filename = trim1.filename
            trim2_filename = trim2.filename
            assembly_filename = assembly.filename

            # Keep the read-pair tag in the new names so R1 and R2 do not
            # end up with the same target filename.
            trim1.rename(f'{sample_name}_{taxon}_R1.fq.gz')
            trim2.rename(f'{sample_name}_{taxon}_R2.fq.gz')
            assembly.rename(f'{sample_name}_{taxon}_000.fna')

            logger.info(f'renamed {trim1_filename} to {trim1.filename}')
            logger.info(f'renamed {trim2_filename} to {trim2.filename}')
            logger.info(f'renamed {assembly_filename} to {assembly.filename}')
        except Exception as e:
            # Best-effort per sample: record the problem and move on.
            logger.warning(e)
CP018363.1 2334023 G A,T 0 30 30 SNP CP018363.1 2948951 T C,G 0 6 6 SNP CP018363.1 3621547 T C,G 0 35 35 SNP CP018363.1 4243092 A G,C 0 33 33 SNP CP018363.1 4562156 T C,G 0 15 15 SNP """ VCF_DIR = Dir("/Strong/proj/.data/alma") FASTA_DIR = VCF_DIR.make_subdir("fasta") MATRIX = FASTA_DIR.join("ALMA_matrix_N.fna") CSV_OUT = FASTA_DIR.join("ALMA_stats.csv") CSV_MUTATIONS = FASTA_DIR.join("ALMA_mutations.csv") POSITIONS = 5626623 files = VCF_DIR.files(endswith='.cf') records = [] multi_alleles = {} print(f"Building Matrix: {MATRIX} from {len(files)} files.") for i, file in enumerate(files): print(f"\t{i:02d} | {file.filename}") isolate = file.filename.split("_vs_")[0] seq = ['-'] * POSITIONS record = {'isolate': isolate, 'totalMulti': 0, 'N': 0, 'alt': 0, 'ref': 0} options=set() with open(file.path, 'r') as file_in, open(MATRIX, 'a+') as file_out: for line in file_in: # EXTRACT METADATA FROM LINE