# NOTE(review): whitespace-collapsed excerpt; original line breaks/indentation are lost and it is cut
# at both ends, so the nesting below cannot be read off the text.
# Visible steps: start an entry of all_threads, poll mp.active_children() once per second until at most
# one child remains, put "FINISHED" on the queue, then poll until none remain. The 'else' branch opens
# output_filename for writing, calls generate_bedgraph_lines(..., parallel=False) for each chromosome
# key of 'input' in sorted order, and closes the file.
# After that: load the genome via fu.import_genome(args.fasta), map each sequence ID to its length,
# and begin reading args.input as whitespace-separated "chrom start end count" records (count -> float).
# NOTE(review): 'coverage = {}' is assigned twice; the handle from open(args.input) is not closed in
# the visible code; the "dictionary for each BEDGRAPH file" comment may be stale here since only one
# file (args.input) is read and coverage is keyed by chrom - confirm.
all_threads[i].start() while len(mp.active_children()) > 1: time.sleep(1) queue.put("FINISHED") while len(mp.active_children()) > 0: time.sleep(1) else: queue = open(output_filename, 'w') for chrom in sorted(list(input.keys())): generate_bedgraph_lines(input[chrom], chrom, queue, parallel=False) queue.close() #'chromosomes' contains the lengths of all chromosomes that the BEDGRAPH contains values for. genome = fu.import_genome(args.fasta) chromosomes = dict([(ID, len(transcript)) for ID, transcript in genome.items()]) #'coverage' is a nested dictionary of float vectors for each nucleotide in the genome. # Contains a dictionary for each BEDGRAPH file with values at each position. coverage = {} graph = open(args.input) coverage = {} for line in graph: chrom, start, end, count = line.rstrip().split() count = float(count) if chrom not in coverage: coverage[chrom] = {}
# NOTE(review): whitespace-collapsed excerpt, cut at both ends; it opens inside an argument list whose
# call starts before this line (restricting a value to 'none', '5p', '3p' or 'both').
# Visible steps: register the boolean flag --nucfreqs (dest NUCFREQS, store_true, default False),
# parse the command line, load the genome with fu.import_genome when args.FASTA is given, and print an
# error and exit with status 1 when args.SOFTCLIP_TYPE is not 'none' but no FASTA was supplied.
# Then open the first file in args.SAMFILES, read one line, and loop while the line begins with '@':
# echo it and split it on tabs; the handling of '@SQ' records is cut off.
# NOTE(review): 'line[0]' raises IndexError if readline() returns '' (end of file); the handle bound
# to 'file' (which also shadows a Python 2 builtin name) is not closed in the visible code.
choices=['none', '5p', '3p', 'both']) parser.add_argument( "--nucfreqs", dest='NUCFREQS', help="Saves a table of nucleotide frequencies in selected genome region.", default=False, action="store_true") args = parser.parse_args() ###################################### # ENVIRONMENT SETUP: DATA STRUCTURES # ###################################### if args.FASTA: genome = fu.import_genome(args.FASTA) if args.SOFTCLIP_TYPE != 'none' and not args.FASTA: print( 'ERROR: Untemplated nucleotide analysis requires a reference genome. Use the --fasta argument.' ) sys.exit(1) chromosomes = {} file = open(args.SAMFILES[0]) line = file.readline() while line[0] == '@': print(line.rstrip()) l = line.rstrip().split('\t') if l[0] == '@SQ':
# NOTE(review): whitespace-collapsed excerpt, cut at both ends; it opens inside an argument list whose
# call starts before this line.
# Visible steps: parse the command line; define which(x, value=True), which returns the indices of
# the items of x that compare equal to value; load the genome with fu.import_genome when args.genome
# is given; read args.lengths as tab-separated "chromosome<TAB>length" lines into 'chromosomes'
# (length stored as int); then create one empty coverage dictionary per path in args.input.
# NOTE(review): lengths_file is not closed in the visible code.
default=None, help='filepath to output G content table') args = parser.parse_args() def which(x, value=True): return [a for a, b in enumerate(x) if b == value] #'chromosomes' contains the lengths of all chromosomes that the BEDGRAPH contains values for. # Expects a two-column tab-separated file with: # chromosome length # Provided with the 'lengths' argument. if args.genome: genome = fu.import_genome(args.genome) chromosomes = {} lengths_file = open(args.lengths) for line in lengths_file: chrom, length = line.rstrip().split('\t') chromosomes[chrom] = int(length) #'coverage' is a nested dictionary of float vectors for each nucleotide in the genome. # Contains a dictionary for each BEDGRAPH file with values at each position. coverage = {} ingraphs = args.input for graph in ingraphs: print('Importing {}...'.format(graph)) coverage[graph] = {}
"""Returns a list of locations in x that satisfy value""" return [a for a, b in enumerate(x) if b == value] def notwhich(x, value=0): """Returns a list of locations in x that do not satisty value""" return [a for a, b in enumerate(x) if b != value] def flatten(list_of_lists): """Collapses a list/tuple of lists into a single list""" return [item for sublist in list_of_lists for item in sublist] if args.GENOME: genome = fu.import_genome(args.GENOME) else: if args.FEATURE != 'transcript': print("ERROR: cannot locate {} features without a reference genome.". format(args.FEATURE)) print("Provide genome FASTA file with -G") sys.exit(1) coverage = None if args.PLUS_BEDGRAPH: for i in args.PLUS_BEDGRAPH: if not coverage: coverage = bu.parse_bedgraph(i, '+') else: bu.add_bedgraph(coverage, i, '+')
# NOTE(review): whitespace-collapsed excerpt, cut at both ends; it opens inside an expression that
# starts before this line (appending exon_number to an 'exon_nums' list).
# Visible steps: sort the keys of ref_transcripts into ref_IDs and print their count together with
# args.reference_GFF; choose picked_IDs as the first tab-separated field of every line in args.subset
# when that option is set, otherwise all ref_IDs; load the genome with fu.import_genome and record
# each chromosome's sequence length; then allocate per-strand coverage dictionaries, filling '+'
# with a float32 zero vector per chromosome (the '-' strand fill is cut off).
# NOTE(review): the handle from open(args.subset) is never closed; int(length) is redundant because
# len() already returns an int.
'exon_nums', []) + [exon_number] ref_IDs = sorted(list(ref_transcripts.keys())) print('# {} reference transcripts: {}'.format(len(ref_IDs), args.reference_GFF)) # 'picked_IDs' is an array of IDs to use from the reference_GFF if args.subset: picked_IDs = [ i.rstrip().split('\t')[0] for i in open(args.subset).readlines() ] else: picked_IDs = ref_IDs # 'genome' is a dict of strings for each chromosome in 'genome_fasta' genome = fu.import_genome(args.genome_fasta) # 'chromosomes' contains the lengths of all chromosomes that the BEDGRAPH contains values for. chromosomes = {} for chrom in genome.keys(): length = len(genome[chrom]) chromosomes[chrom] = int(length) # 'coverage' is a dictionary of float vectors for each nucleotide in the genome. # Contains the value of the BEDGRAPH file at each position. coverage = {} coverage['+'] = {} coverage['-'] = {} for chrom, chromlen in chromosomes.items(): coverage['+'][chrom] = np.zeros(chromlen, dtype='float32')
# NOTE(review): whitespace-collapsed excerpt, cut at both ends; it opens with the closing bracket of
# a list literal that starts before this line.
# Visible steps: define 'notkeys', then walk keys/values/complements/notkeys in lockstep to fill the
# IUPAChash, IUPACcomp and IUPACnot lookup tables; load the genome with
# fu.import_genome(args.genome_FASTA, keep_case=False) and record each sequence length; when run as a
# script, read args.match_file line by line, counting lines, skipping those starting with '#', and
# unpacking each remaining line into BED_filename, search_sequence and mismatches.
# NOTE(review): 'V' appears twice in notkeys - verify against the order of 'keys' (not visible here);
# the separator inside split(...) may have been a tab before whitespace was collapsed - confirm;
# the bare 'except:' also catches KeyboardInterrupt/SystemExit, its handler is cut off.
] notkeys = [ 'B', 'D', 'H', 'V', 'V', '-', 'K', 'Y', 'S', 'W', 'R', 'M', 'T', 'G', 'C', 'A', 'N', '.' ] for k, v, c, n in zip(keys, values, complements, notkeys): IUPAChash[k] = v IUPACcomp[k] = c IUPACnot[k] = n ################################# # LOADING DATA FROM INPUT FILES # ################################# genome = fu.import_genome(args.genome_FASTA, keep_case=False) chromosomes = {} for k, v in genome.items(): chromosomes[k] = len(v) linecounter = 0 if __name__ == '__main__': for line in open(args.match_file): linecounter += 1 if line[0] == '#': continue try: BED_filename, search_sequence, mismatches = line.rstrip().split( ' ') except: