in_folder, o_file, gff_file = command_line()

# Parse the GFF annotation, then give every gene that is neither a transposon
# nor a common RNA an empty list that will hold its per-library read counts.
gff_genes_dict = GFF.Parse(gff_file, create_nuc_dict=True)
library_gene_counts = {}
for (gene, [chromosome, spos, epos]) in gff_genes_dict:
    if gff_genes_dict.is_transposon(gene):
        continue
    if gff_genes_dict.is_common_rna(gene):
        continue
    library_gene_counts[gene] = []

# Visit every file found anywhere below in_folder.
for root, subfolders, files in os.walk(in_folder):
    for f_name in files:
        print("Working on", f_name)
        f_in = os.path.join(root, f_name)

        # Let the SAM parser tally the reads mapped to each annotated gene.
        parse_sam = SAM.Parse(f_in)
        parse_sam.reads_per_gene(gff_genes_dict)
        parse_sam.start()
        temp_gene_counts = parse_sam.get_reads_per_gene()

        # Second pass over the same file, used to total the reads in this
        # library. A ".gz" file is opened through gzip in binary mode, so each
        # of its lines has to be decoded before it can be split into fields.
        total_reads = 0
        gzipped = f_in.endswith(".gz")
        infile = gzip.open(f_in, 'rb') if gzipped else open(f_in)
        for line in infile:
            if gzipped:
                line = line.decode('utf-8')
            line = line.split()
'-o', default= "/home/mattchat/SuecicaDupSearch/Data/RNA-Seq/Col0_leaf_RNA-seq/libWhan_10nc_gene_count.tsv", type=str, help='Output file for gene-mapped read counts', metavar='OutputFile') args = parser.parse_args() i_file = args.i gff_file = args.gff o_file = args.o return (i_file, gff_file, o_file) i_file, gff_file, o_file = command_line() #gff_obj = GFF.Parse_Genes(gff_file, "AtChr1;AtChr2;AtChr3;AtChr4;AtChr5") gff_obj = GFF.parse_genes(gff_file, "Chr1;Chr2;Chr3;Chr4;Chr5") parse_sam = SAM.Parse(i_file) parse_sam.reads_per_gene(gff_obj) parse_sam.start() gene_counts = parse_sam.get_reads_per_gene() with open(o_file, 'w') as outfile: outline = "Gene\tReads_mapped\n" outfile.write(outline) for gene, count in sorted(gene_counts.items(), key=lambda gene_counts: gene_counts[1], reverse=True): outline = str(gene) + "\t" + str(count) + "\n" outfile.write(outline)