예제 #1
0
def update_taxa(species_threshold=0.97, genus_threshold=0.80):
    """Rename each sample's trimmed reads and assembly to carry its taxon.

    Every ``.csv`` file found in the run's ``ANI`` directory is read as a
    table with (at least) the columns ``sample``, ``ani`` and ``taxon``.
    The taxon label is chosen as follows:

    * ``taxon`` of the highest-ANI row, if any row has
      ``ani >= species_threshold``;
    * ``'NTM'`` if no row reaches the species threshold but at least one
      has ``ani >= genus_threshold``;
    * ``'UNKNOWN'`` otherwise.

    The sample's R1/R2 trimmed reads and its assembly are then renamed to
    include that label, and each rename is logged.

    Args:
        species_threshold: Minimum ANI for a species-level assignment.
        genus_threshold: Minimum ANI for the fallback ``'NTM'`` label.

    Returns:
        None. Failures for an individual ANI file (missing reads, missing
        columns, rename errors, ...) are logged as warnings and the loop
        moves on to the next file.
    """
    logger = generic_logger('update_taxa.csv')

    # The run name is the name of the directory ``Dir()`` resolves to
    # (presumably the current working directory -- confirm against Dir).
    run = Dir().dirname
    trim_dir = Dir(BASEDIR.join('data', 'trimmed_reads', run))
    assembly_dir = Dir(BASEDIR.join('data', 'assemblies', run))
    ani_dir = Dir(assembly_dir.join('ANI'))

    trimmed_reads = trim_dir.files(endswith="fq", dataframe=True)
    assemblies = assembly_dir.files(endswith='fna', dataframe=True)
    ani = ani_dir.files(endswith='.csv')

    for file in ani:
        try:
            # Best hit first, so ``iloc[0]`` of any filtered view is the
            # closest reference rather than the weakest qualifying one.
            df = pd.read_csv(file.path).sort_values('ani', ascending=False)

            # Bracket access is required: ``row.sample`` would return the
            # ``Series.sample`` method instead of the column value.
            sample_name = str(df.iloc[0]['sample'])

            # Match the sample name literally, not as a regular expression.
            read_names = trimmed_reads.filename.str
            is_sample_read = read_names.contains(sample_name, regex=False)
            is_r1 = read_names.contains('_R1', regex=False)
            is_r2 = read_names.contains('_R2', regex=False)
            is_sample_assembly = assemblies.filename.str.contains(
                sample_name, regex=False)

            # ``iloc[0]`` raises IndexError when nothing matches; the
            # handler below logs it and skips this sample.
            trim1 = File(trimmed_reads[is_sample_read & is_r1].iloc[0].path)
            trim2 = File(trimmed_reads[is_sample_read & is_r2].iloc[0].path)
            assembly = File(assemblies[is_sample_assembly].iloc[0].path)

            # ASSIGN TAXON
            taxon = 'UNKNOWN'
            species_hits = df[df['ani'] >= species_threshold]
            if not species_hits.empty:
                taxon = species_hits.iloc[0]['taxon']
            elif (df['ani'] >= genus_threshold).any():
                taxon = 'NTM'

            # Remember the old names before the File objects are renamed.
            trim1_filename = trim1.filename
            trim2_filename = trim2.filename
            assembly_filename = assembly.filename

            # Keep the read-direction marker so R1 and R2 get distinct
            # names and remain discoverable by the '_R1'/'_R2' lookup above.
            # NOTE(review): reads are listed with endswith="fq" but renamed
            # to '.fq.gz' -- confirm the intended extension.
            trim1.rename(f'{sample_name}_{taxon}_R1.fq.gz')
            trim2.rename(f'{sample_name}_{taxon}_R2.fq.gz')
            assembly.rename(f'{sample_name}_{taxon}_000.fna')

            logger.info(f'renamed {trim1_filename} to {trim1.filename}')
            logger.info(f'renamed {trim2_filename} to {trim2.filename}')
            logger.info(f'renamed {assembly_filename} to {assembly.filename}')

        except Exception as e:
            # Deliberate best-effort: one bad sample must not stop the run.
            logger.warning(e)
예제 #2
0
CP018363.1	2334023	G	A,T	0	30	30	SNP
CP018363.1	2948951	T	C,G	0	6	6	SNP
CP018363.1	3621547	T	C,G	0	35	35	SNP
CP018363.1	4243092	A	G,C	0	33	33	SNP
CP018363.1	4562156	T	C,G	0	15	15	SNP
"""


# Length of the per-isolate sequence buffer allocated in the loop below
# (presumably the reference genome length -- TODO confirm).
POSITIONS = 5_626_623

# Input directory and the "fasta" subdirectory that holds all outputs.
VCF_DIR = Dir("/Strong/proj/.data/alma")
FASTA_DIR = VCF_DIR.make_subdir("fasta")

# Output paths, all located inside FASTA_DIR.
MATRIX = FASTA_DIR.join("ALMA_matrix_N.fna")
CSV_OUT = FASTA_DIR.join("ALMA_stats.csv")
CSV_MUTATIONS = FASTA_DIR.join("ALMA_mutations.csv")

# Input files to process.
# NOTE(review): the suffix is '.cf' although the directory is named VCF_DIR;
# confirm this is not meant to be '.vcf'.
files = VCF_DIR.files(endswith=".cf")

# Accumulators, presumably filled by the per-file loop below.
records = []
multi_alleles = {}

print(f"Building Matrix: {MATRIX} from {len(files)} files.")
for i, file in enumerate(files):
    print(f"\t{i:02d} | {file.filename}")
    isolate = file.filename.split("_vs_")[0]
    seq = ['-'] * POSITIONS
    record = {'isolate': isolate, 'totalMulti': 0, 'N': 0, 'alt': 0, 'ref': 0}

    options=set()
    with open(file.path, 'r') as file_in, open(MATRIX, 'a+') as file_out:
        for line in file_in:
            # EXTRACT METADATA FROM LINE