def main(fastq1, fastq2):
    """Write barcode/genome records from a pair of FASTQ files to stdout.

    Both files are read in lockstep. On every sequence line (line 1 of each
    4-line record) the barcode is the part of the forward read before the
    first "CATG", and the genomic fragment is the part of the reverse read
    located after the transposon match and before the first "CATG". When
    the pair passes all filters, the following quality line (line 3) is
    scored and one record ``>barcode:qbrcd,qgen`` plus the fragment is
    written to standard output.

    Relies on the module-level names ``min_brcd``, ``max_brcd``,
    ``min_genome``, ``transposon``, ``score_from_quality`` and ``substr``.
    """
    with gzopen(fastq1) as fwd, gzopen(fastq2) as rev:
        # izip() walks both files together, one line of each at a time.
        for index, (fwd_line, rev_line) in enumerate(izip(fwd, rev)):
            phase = index % 4
            if phase == 1:
                # Sequence line: decide whether this read pair is kept.
                keep = False
                # Without any "CATG" the whole read is the "barcode" and
                # gets rejected below for being too long.
                barcode = fwd_line.rstrip().split('CATG')[0]
                if min_brcd < len(barcode) < max_brcd:
                    # The genomic part starts right after the transposon;
                    # a start of 0 means the automaton found no match.
                    start = transposon.end(rev_line) + 1
                    if start:
                        insert = rev_line[start:].split('CATG')[0].rstrip()
                        keep = not (len(insert) < min_genome)
            elif phase == 3 and keep:
                # Quality line of a pair accepted two lines earlier.
                barcode_score = score_from_quality(
                    substr(fwd_line, 0, len(barcode)))
                insert_score = score_from_quality(
                    substr(rev_line, start, len(insert)))
                sys.stdout.write(
                    '>%s:%d,%d\n%s\n'
                    % (barcode, barcode_score, insert_score, insert))
def match_reads(file_name_1, file_name_2):
    """Split paired reads into one ``<index>.fastq`` file per index.

    The two FASTQ files are read in lockstep. For each record, the index is
    characters 3-6 of the first read's sequence line. A record is kept when
    ``test()`` accepts the sequence lines and ``testquality()`` accepts the
    quality lines; it is then appended to the file named after its index
    (created in the current directory on first use).

    Each record is written as soon as its quality line has been checked.
    Earlier versions deferred the write until the *next* record's header
    line was read, which silently dropped the last accepted record of the
    input.

    Relies on the module-level callables ``test`` and ``testquality``.
    """
    outf = {}
    passed = False
    try:
        with gzopen(file_name_1) as file1, gzopen(file_name_2) as file2:
            for lineno, line1 in enumerate(file1):
                line2 = file2.readline()
                modulo = lineno % 4
                if modulo == 1:
                    # Sequence line: extract the index and run the first test.
                    index = line1[3:7]
                    passed = test(line1[7:], line2)
                    if passed:
                        seq = line2[20:]
                        spotname = line1[28:]
                elif modulo == 3 and passed:
                    # Quality line: run the second test, then emit.
                    passed = testquality(line1[28:50], line2[20:50])
                    if passed:
                        quality = line2[20:]
                        if index not in outf:
                            outf[index] = open(index + '.fastq', 'w')
                        outf[index].write(
                            '@' + spotname + seq + '+\n' + quality)
    finally:
        # Close every per-index output, even if reading failed midway.
        for handle in outf.values():
            handle.close()
def trimm_hic_reads(read1_fastq, read2_fastq):
    """Trim Hi-C reads at the first GATC site and write them as FASTA.

    Each read is cut at the first uncut restriction enzyme site (GATC) and
    only the leftmost part is kept. A pair is written only when both
    trimmed reads are longer than 16 nucleotides; the FASTA header is the
    0-based record number in the input.

    Returns the list ``[out1, out2]`` of output file names. If both output
    files already exist, nothing is recomputed.
    """
    # Derive the output names from the input names.
    out1 = re.sub(r".fastq(\.gz)?", "read1.fasta", read1_fastq)
    out2 = re.sub(r".fastq(\.gz)?", "read2.fasta", read2_fastq)
    # Substitution failed: append a suffix so that the output can never be
    # the input file itself (it would be reported as done, or truncated).
    if out1 == read1_fastq:
        out1 = read1_fastq + ".read1.fasta"
    if out2 == read2_fastq:
        out2 = read2_fastq + ".read2.fasta"
    # Skip the work if both outputs exist.
    if os.path.exists(out1) and os.path.exists(out2):
        return [out1, out2]
    # Cut at the restriction site GATC and write the FASTA files.
    with gzopen(read1_fastq) as f, gzopen(read2_fastq) as g, \
            open(out1, "w") as y, open(out2, "w") as z:
        for lineno, (line1, line2) in enumerate(izip(f, g)):
            # Only the sequence line of each 4-line record is used.
            if lineno % 4 != 1:
                continue
            seq1 = line1.rstrip().split("GATC")[0]
            seq2 = line2.rstrip().split("GATC")[0]
            if len(seq1) > 16 and len(seq2) > 16:
                record = lineno // 4
                y.write(">%d\n" % record)
                y.write(seq1 + "\n")
                z.write(">%d\n" % record)
                z.write(seq2 + "\n")
    print([out1, out2])
    return [out1, out2]
def extract_reads_from_PE_fastq(fname_iPCR_PE1, fname_iPCR_PE2):
    """Extract (barcode, genomic fragment) pairs from paired-end reads.

    The barcode is read from the forward file and is kept only when the
    reverse read contains the transposon. Accepted pairs are written in
    FASTA format (barcode as header, genomic fragment as sequence).

    Returns the name of the FASTA file. If that file already exists it is
    returned as is, without reprocessing the input.
    """
    MIN_BRCD = 15
    MAX_BRCD = 25
    MIN_GENOME = 15

    # The end of the transposon on the reverse read is matched with a
    # Levenshtein automaton allowing up to 5 mismatches/indels. On the
    # forward read the only known sequence is the CATG that follows the
    # barcode, and it is matched exactly.
    pT2 = seeq.compile('TGTATGTAAACTTCCGACTTCAACTGTA', 5)

    # Name of the output file, derived from the forward file name.
    fname_fasta = re.sub(r'[\_F][w\_].fastq(\.gz)?', 'iPCR.fasta',
                         fname_iPCR_PE1)
    if fname_fasta == fname_iPCR_PE1:
        # The pattern did not match: append '.fasta' to avoid a collision.
        fname_fasta = fname_iPCR_PE1 + '.fasta'

    if os.path.exists(fname_fasta):
        return fname_fasta

    with gzopen(fname_iPCR_PE1) as fwd, gzopen(fname_iPCR_PE2) as rev, \
            open(fname_fasta, 'w') as fasta:
        for n, (read1, read2) in enumerate(izip(fwd, rev)):
            # Only the sequence line of each 4-line record is used.
            if n % 4 == 1:
                # Everything before the first "CATG". Without any "CATG"
                # the whole read is taken and rejected as too long.
                barcode = read1.rstrip().split('CATG')[0]
                if MIN_BRCD < len(barcode) < MAX_BRCD:
                    # Part of the reverse read that follows the transposon.
                    tail = pT2.matchSuffix(read2, False)
                    if tail:
                        # Keep up to the first "CATG", if any.
                        insert = tail.split('CATG')[0].rstrip()
                        if len(insert) >= MIN_GENOME:
                            fasta.write('>%s\n%s\n' % (barcode, insert))

    return fname_fasta
def parse(cls, fname1, fname2):
    '''Yield one object per read pair from two fastq files read in lockstep.

    Each object is built as ``cls(read1, read2, qual1, qual2)`` from the
    sequence lines and quality lines of a 4-line record, with trailing
    whitespace removed.
    '''
    with gzopen(fname1) as fastq1, gzopen(fname2) as fastq2:
        for n, (text1, text2) in enumerate(izip(fastq1, fastq2)):
            phase = n % 4
            if phase == 1:
                # Sequence line: remember it until the quality line.
                seq1, seq2 = text1.rstrip(), text2.rstrip()
            elif phase == 3:
                # Quality line: the record is complete.
                yield cls(seq1, seq2, text1.rstrip(), text2.rstrip())
def parse_fq(filename):
    """Parse a (possibly multi-line) FASTQ file.

    Yields ``(label, sequence, quality)`` tuples, where ``label`` is the
    full header line including the leading "@". Blank lines are ignored and
    wrapped sequence/quality lines are joined.

    "@" and "+" are legal quality characters, so a quality line may start
    with either. While fewer quality characters than sequence characters
    have been read for the current record, every line is therefore taken
    as quality, whatever its first character. Content found before the
    first header is ignored.
    """
    label = None
    seq = []
    qual = []
    seq_len = 0
    qual_len = 0
    in_quality = False

    with gzopen(filename) as f:
        for line in f:
            line = line.rstrip("\r\n")
            if not line:
                continue

            # Quality still incomplete: this line cannot be a header or a
            # separator, even if it starts with "@" or "+".
            if in_quality and qual_len < seq_len:
                qual.append(line)
                qual_len += len(line)
                continue

            if line[0] == "@":
                # Start of a new record; emit the previous one, if any.
                if label is not None:
                    yield label, "".join(seq), "".join(qual)
                label = line
                seq = []
                qual = []
                seq_len = 0
                qual_len = 0
                in_quality = False
                continue

            if label is None:
                # Nothing to attach this line to yet.
                continue

            if line[0] == "+":
                in_quality = True
                continue

            if in_quality:
                qual.append(line)
                qual_len += len(line)
            else:
                seq.append(line)
                seq_len += len(line)

    if label is not None:
        yield label, "".join(seq), "".join(qual)
def binit(bin_size, fname, limits=limits_hg19):
    """Count uniquely mapped reads per genomic bin and print a table.

    Arguments:
      bin_size: width of the bins, in nucleotides.
      fname: tab-separated mapping file with the fields read name,
          sequence, quality, map count, position(s).
      limits: dictionary of chromosome name -> chromosome size. It defines
          which bins are printed.

    One line ``chrom<TAB>start<TAB>end<TAB>count`` is written to standard
    output for every complete bin of every chromosome in ``limits``.
    Reads mapping to an already seen (chromosome, position) are counted
    only once.
    """
    getmap = itemgetter(0, 2)
    unique_reads = set()
    counts = defaultdict(int)
    # Keep only reads with a unique map: "0:1" or "1+...".
    uniq = re.compile(r'^[0:+]*1(?:\Z|\D)')

    with gzopen(fname) as f:
        for line in f:
            item = line.rstrip().split('\t')
            if not uniq.search(item[3]):
                continue
            # The mapping is like "chr6:+:52132829:2C29".
            chrom, pos = getmap(item[4].split(':'))
            # Keep only one read (the first) of a read series.
            if (chrom, pos) in unique_reads:
                continue
            unique_reads.add((chrom, pos))
            counts[(chrom, int(pos) // bin_size)] += 1

    # Use the 'limits' argument (the global default used to be read
    # directly here, which made the parameter ineffective).
    for chrom, size in sorted(limits.items()):
        for b in range(size // bin_size):
            sys.stdout.write(
                "%s\t%d\t%d\t%d\n"
                % (chrom, 1 + b * bin_size, (1 + b) * bin_size,
                   counts[(chrom, b)]))
def call_starcode_fastq_file(fastq):
    """Extract barcodes from a FASTQ file and cluster them with starcode.

    Each read is matched against the GFP pattern first and the spike
    pattern second. When the match starts between MIN_BRCD and MAX_BRCD
    (inclusive), the part of the read before the match is written to a
    temporary file (one for GFP hits, one for spike hits). ``starcode`` is
    then run on both temporary files, and the two output file names are
    appended to the module-level lists ``processed`` and ``spikessed``.

    The temporary files are removed even if reading or clustering fails.
    """
    MIN_BRCD = 15
    MAX_BRCD = 25

    # NOTE(review): 'fname' is not a parameter of this function; it is
    # presumably a module-level name set by the caller. Confirm that it is
    # not meant to be the 'fastq' argument.
    brcd_outfname = fname + '_barcodes.tsv'
    spk_outfname = fname + '_spikes.tsv'

    GFP = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
    SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)

    barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
    spike_tempf = tempfile.NamedTemporaryFile(delete=False)
    try:
        with gzopen(fastq) as f:
            for lineno, line in enumerate(f):
                # Only the sequence line of each 4-line record is used.
                if lineno % 4 != 1:
                    continue
                hit = GFP.match(line)
                if hit is not None:
                    outf = barcode_tempf
                else:
                    hit = SPIKE.match(line)
                    if hit is None:
                        continue
                    outf = spike_tempf
                # Start of the match = length of the candidate barcode.
                pos = hit.matchlist[0][0]
                if MIN_BRCD <= pos <= MAX_BRCD:
                    outf.write(line[:pos] + '\n')

        # Flush to disk before starcode reads the files.
        barcode_tempf.close()
        spike_tempf.close()

        subprocess.call([
            'starcode', '-t4',
            '-i', barcode_tempf.name,
            '-o', brcd_outfname,
        ])
        subprocess.call([
            'starcode', '-t4',
            '-i', spike_tempf.name,
            '-o', spk_outfname,
        ])
    finally:
        # Delete temporary files (close() is harmless if already closed).
        for tempf in (barcode_tempf, spike_tempf):
            tempf.close()
            os.unlink(tempf.name)

    # Save the names of the files processed.
    processed.append(brcd_outfname)
    spikessed.append(spk_outfname)
    return
def extract_reads_from_PE_fastq(fname_iPCR_PE1, fname_iPCR_PE2):
    """Pair each barcode with the genomic sequence found next to the
    transposon in the mate read, and save the pairs in FASTA format.

    Returns the name of the FASTA file; when that file already exists the
    input is not processed again.
    """
    MIN_BRCD = 15
    MAX_BRCD = 25
    MIN_GENOME = 15

    # Levenshtein automaton for the end of the transposon on the reverse
    # read, with up to 5 mismatches/indels allowed. The forward read only
    # has one known sequence, the CATG after the barcode (exact match).
    pT2 = seeq.compile('TGTATGTAAACTTCCGACTTCAACTGTA', 5)

    def fasta_record(read1, read2):
        """Return the FASTA text for one read pair, or None to reject it."""
        # Part before the first "CATG"; with no "CATG" at all the barcode
        # is the full read and fails the length test.
        barcode = read1.rstrip().split('CATG')[0]
        if not MIN_BRCD < len(barcode) < MAX_BRCD:
            return None
        # Sequence that follows the transposon, if it was found.
        tail = pT2.matchSuffix(read2, False)
        if not tail:
            return None
        # Stop at the first "CATG", if any.
        insert = tail.split('CATG')[0].rstrip()
        if len(insert) < MIN_GENOME:
            return None
        return '>%s\n%s\n' % (barcode, insert)

    # Output file name, derived from the name of the first input file.
    fname_fasta = re.sub(r'read[1-2].fastq(\.gz)?', 'iPCR.fasta',
                         fname_iPCR_PE1)
    if fname_fasta == fname_iPCR_PE1:
        # Nothing was substituted: append '.fasta' to avoid a collision.
        fname_fasta = fname_iPCR_PE1 + '.fasta'

    if os.path.exists(fname_fasta):
        return fname_fasta

    with gzopen(fname_iPCR_PE1) as fwd, gzopen(fname_iPCR_PE2) as rev, \
            open(fname_fasta, 'w') as fasta:
        for n, (read1, read2) in enumerate(izip(fwd, rev)):
            # Sequence lines only.
            if n % 4 != 1:
                continue
            record = fasta_record(read1, read2)
            if record is not None:
                fasta.write(record)

    return fname_fasta
def parse_tsv(fn, ichrom, ipos, chrmap, skip=1):
    """Iterate over the rows of a whitespace-separated table.

    Arguments:
      fn: name of the file to read.
      ichrom: index of the column holding the chromosome name.
      ipos: index of the column holding the position.
      chrmap: mapping applied to the chromosome name to build the key.
      skip: number of leading lines to discard (header).

    Yields ``(key, fields)`` where ``key`` is
    ``(chrmap[chromosome], int(position))`` and ``fields`` is the list of
    all columns of the row.

    A file with fewer than ``skip`` lines yields nothing, and a ``skip``
    of zero or less discards nothing.
    """
    with gzopen(fn) as f:
        for _ in range(skip):
            # A default avoids StopIteration escaping from the generator
            # when the file is shorter than the header.
            if next(f, None) is None:
                break
        for line in f:
            fields = line.split()
            key = (chrmap[fields[ichrom]], int(fields[ipos]))
            yield key, fields
def extract_reads_from_PE_fastq(fname_iPCR_PE1, fname_iPCR_PE2):
    """Pair the barcode of the forward read with the full reverse read.

    The barcode is the first 20 nucleotides of the forward read and the
    "genome" is the whole reverse read; the decision on whether the
    reverse read is usable is left to the mapping step. Pairs are written
    in FASTA format (barcode as header).

    Returns the name of the FASTA file; when that file already exists the
    input is not processed again.
    """
    MIN_BRCD = 15
    MAX_BRCD = 25
    MIN_GENOME = 15

    # Name of the output file, derived from the forward file name.
    fname_fasta = re.sub(r'[A-Za-z]+_iPCR_([\w]+)_[a-zA-Z0-9]+.fastq',
                         r'iPCR_\1.fasta', fname_iPCR_PE1)
    # Substitution failed, append '.fasta' to avoid name collision.
    if fname_fasta == fname_iPCR_PE1:
        fname_fasta = fname_iPCR_PE1 + '.fasta'

    # Skip if file exists.
    if os.path.exists(fname_fasta):
        return fname_fasta

    with gzopen(fname_iPCR_PE1) as f, gzopen(fname_iPCR_PE2) as g, \
            open(fname_fasta, 'w') as outf:
        # Aggregate iterator of f,g iterators -> izip(f,g).
        for lineno, (line1, line2) in enumerate(izip(f, g)):
            # Take sequence only.
            if lineno % 4 != 1:
                continue
            # Strip the line terminator before slicing, otherwise a read
            # shorter than 20 nt carries its newline into the barcode and
            # breaks the FASTA header.
            brcd = line1.rstrip()[:20]
            if not MIN_BRCD < len(brcd) < MAX_BRCD:
                continue
            # Rely on the mapping results to decide about the genome.
            genome = line2.rstrip()
            if len(genome) < MIN_GENOME:
                continue
            outf.write('>%s\n%s\n' % (brcd, genome))

    return fname_fasta
def binit(bin_size, input_file, limits=limits_hg19, output_dir='.'):
    """Bin mapped reads into windows of fixed size and write a BED file.

    Reads whose first non-empty stratum contains a single hit are counted
    as unique; reads with several hits in that stratum are counted as
    multiple. A bin that received only multiple hits is reported as 'NA'
    (repeat), any other bin reports its number of unique hits.

    Arguments:
      bin_size: width of the bins, in nucleotides.
      input_file: tab-separated mapping file with the fields read name,
          sequence, quality, map count, position(s).
      limits: dictionary of chromosome name -> chromosome size. It defines
          which bins are written.
      output_dir: directory of the output file (created if missing). The
          file is named ``<bin_size>bin-<input base name>.bed``.
    """
    getmap = itemgetter(0, 2)
    unique_maps = set()
    unique_counts = defaultdict(int)
    multiple_counts = defaultdict(int)

    with gzopen(input_file) as f:
        for line in f:
            item = line.rstrip().split('\t')
            # No hit at all.
            if item[4] == '-':
                continue
            # Size of the first non-empty stratum, e.g. 1 for "1",
            # "0:0:1" or "0:0:0+1".
            stratum_size = int(
                re.search(r'^[0:+]*(\d+)', item[3]).groups()[0])
            thisdict = unique_counts if stratum_size == 1 else multiple_counts
            for hit in item[4].split(',')[:stratum_size]:
                # A hit is like "chr1:+:12942:34T1".
                chrom, pos = getmap(hit.split(':'))
                # Keep only one read (the first) of a read series.
                if (chrom, pos) in unique_maps:
                    continue
                unique_maps.add((chrom, pos))
                thisdict[(chrom, int(pos) // bin_size)] += 1

    # Create the output directory; tolerate a concurrent creation.
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise

    base = os.path.splitext(os.path.basename(input_file))[0]
    output_fname = "%sbin-%s.bed" % (bin_size, base)
    output_file = str(os.path.join(output_dir, output_fname))

    with open(output_file, 'w') as output_f:
        # Use the 'limits' argument (the global default used to be read
        # directly here, which made the parameter ineffective).
        for chrom, size in sorted(limits.items()):
            for b in range(size // bin_size):
                coord = (chrom, b)
                mapping = "%s\t%d\t%d\t" % (
                    chrom, 1 + b * bin_size, (1 + b) * bin_size)
                unmappable = (multiple_counts[coord]
                              and not unique_counts[coord])
                count = 'NA' if unmappable else str(unique_counts[coord])
                output_f.write(mapping + count + '\n')
def call_starcode_fastq_file(fastq):
    """Cluster the barcodes and the spikes of a FASTQ file with starcode.

    Reads are matched against the GFP pattern, then against the spike
    pattern. The text located before the match is saved in a temporary
    file (GFP and spike hits are kept apart) when the match starts at a
    position between MIN_BRCD and MAX_BRCD, inclusive. ``starcode`` is run
    on each temporary file and the names of its output files are appended
    to the module-level lists ``processed`` and ``spikessed``.

    Temporary files are always deleted, including when an error occurs.
    """
    MIN_BRCD = 15
    MAX_BRCD = 25

    # NOTE(review): 'fname' is not defined in this function; presumably a
    # module-level name. Confirm it is not meant to be 'fastq'.
    brcd_outfname = fname + '_barcodes.tsv'
    spk_outfname = fname + '_spikes.tsv'

    GFP = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
    SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)

    barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
    spike_tempf = tempfile.NamedTemporaryFile(delete=False)
    try:
        with gzopen(fastq) as f:
            for lineno, line in enumerate(f):
                # Sequence lines only.
                if lineno % 4 != 1:
                    continue
                hit = GFP.match(line)
                if hit is not None:
                    outf = barcode_tempf
                else:
                    hit = SPIKE.match(line)
                    if hit is None:
                        continue
                    outf = spike_tempf
                # The barcode is whatever precedes the match.
                pos = hit.matchlist[0][0]
                if MIN_BRCD <= pos <= MAX_BRCD:
                    outf.write(line[:pos] + '\n')

        # Make sure everything is on disk before calling starcode.
        barcode_tempf.close()
        spike_tempf.close()

        subprocess.call([
            'starcode', '-t4',
            '-i', barcode_tempf.name,
            '-o', brcd_outfname,
        ])
        subprocess.call([
            'starcode', '-t4',
            '-i', spike_tempf.name,
            '-o', spk_outfname,
        ])
    finally:
        # Delete temporary files (closing twice is harmless).
        for tempf in (barcode_tempf, spike_tempf):
            tempf.close()
            os.unlink(tempf.name)

    # Save the names of the files processed.
    processed.append(brcd_outfname)
    spikessed.append(spk_outfname)
    return
def parse_dbSNP_dump(fn):
    """Iterate over the records of a whitespace-separated dbSNP dump.

    Yields ``(rs, ss, chrom, pos, ref, alt)`` taken from the first six
    columns of each line; ``pos`` is converted to an integer.
    """
    with gzopen(fn) as dump:
        for record in dump:
            cols = record.split()
            yield (cols[0], cols[1], cols[2], int(cols[3]),
                   cols[4], cols[5])
def trimm_hic_reads(read1_fastq, read2_fastq):
    '''Trim Hi-C reads at the first GATC site and write them as FASTA.

    Each read is cut at the first uncut restriction enzyme site (GATC) and
    only the leftmost part is kept. A pair is written only when both
    trimmed reads are longer than 16 nucleotides; the FASTA header is the
    0-based record number in the input.

    Returns the tuple (out1, out2) of output file names.
    '''
    # Derive the output names from the input names (the function
    # parameters, not the undefined names 'fname1' and 'fname2').
    out1 = re.sub(r'.fastq(\.gz)?', 'read1.fasta', read1_fastq)
    out2 = re.sub(r'.fastq(\.gz)?', 'read2.fasta', read2_fastq)
    # Substitution failed: append a suffix so that opening the output for
    # writing can never truncate the input file.
    if out1 == read1_fastq:
        out1 = read1_fastq + '.read1.fasta'
    if out2 == read2_fastq:
        out2 = read2_fastq + '.read2.fasta'
    # Cut at the restriction site GATC and write the FASTA files.
    with gzopen(read1_fastq) as f, gzopen(read2_fastq) as g, \
            open(out1, 'w') as y, open(out2, 'w') as z:
        for lineno, (line1, line2) in enumerate(izip(f, g)):
            # Only the sequence line of each 4-line record is used.
            if lineno % 4 != 1:
                continue
            seq1 = line1.rstrip().split('GATC')[0]
            seq2 = line2.rstrip().split('GATC')[0]
            if len(seq1) > 16 and len(seq2) > 16:
                record = lineno // 4
                y.write('>%d\n' % record)
                y.write(seq1 + '\n')
                z.write('>%d\n' % record)
                z.write(seq2 + '\n')
    return (out1, out2)
def parse_methlist(filename):
    """Iterate over the records of a tab-separated methylation list.

    The first line is a header and is skipped. The expected columns are
    chrom, pos-0, pos-1, strand, depth, C, mC, context.

    Yields ``(chrom, start, end, strand, depth, C, mC, context)`` where
    start, end, depth, C and mC are integers. An empty file yields
    nothing.
    """
    with gzopen(filename) as f:
        # Skip the header. The default keeps StopIteration from escaping
        # the generator when the file is empty.
        next(f, None)
        for line in f:
            fields = line.rstrip().split("\t")
            chrom = fields[0]
            start, end = map(int, fields[1:3])
            strand = fields[3]
            depth, C, mC = map(int, fields[4:7])
            context = fields[7]
            yield chrom, start, end, strand, depth, C, mC, context
def parse_trusted(fn):
    """Iterate over the rows of a CSV file of trusted SNPs.

    Yields ``(snp_num, chrom, pos, ref, alt, reversed)``. Chromosome "30"
    is reported as "X". The last item is True when the sixth column is
    "ADBR" and False when it is "ARBD"; any other value raises KeyError.
    """
    is_reversed = {"ADBR": True, "ARBD": False}
    with gzopen(fn) as handle:
        for record in csv.reader(handle):
            number = int(record[0])
            chromosome = "X" if record[1] == "30" else record[1]
            position = int(record[2])
            ref_allele, alt_allele, orientation = (
                record[3], record[4], record[5])
            yield (number, chromosome, position, ref_allele, alt_allele,
                   is_reversed[orientation])
def file2ngram_info(infile, min_len, max_len):
    """Collect term information from a d3_feats file.

    Returns a list with one entry per accepted input line. Each entry is a
    list of 7 strings:

        [ngram_length, canonicalized term, surface term, doc_id,
         pos_signature, prev_Npr, prev_N]

    e.g., the first five items could be
        3, epitaxial silicon process, epitaxial silicon processes,
        000171485800006, JNN

    prev_Npr is the value of the prev_Npr feature of the line and prev_N
    the canonicalized form of its first "_"-separated part; both are empty
    strings when the line has no such feature.

    NOTE: All elements are returned as strings, including ngram_length.
    min_len and max_len constrain the length of ngrams to be included in
    the output. Terms rejected by canon.illegal_phrase_p are left out.

    The process exits (sys.exit) if the last feature of an accepted line
    is not labeled tag_sig.
    """
    # list of lists of info to be returned for each line of input file
    l_term_info = []
    s_infile = gzopen.gzopen(infile)
    try:
        for line in s_infile:
            line = line.strip("\n")
            l_fields = line.split("\t")
            filename = l_fields[0]
            doc_id = path_base_name(filename)
            term = l_fields[2]
            ngram_len = len(term.split(" "))

            # skip the term unless it passes the ngram length and filter
            # checks
            if (not (min_len <= ngram_len <= max_len)
                    or canon.illegal_phrase_p(term)):
                continue

            canon_np = can.get_canon_np(term)
            # We assume that the last feature on the line is tag_sig!
            pos_sig = l_fields[-1]
            if pos_sig[:7] != "tag_sig":
                print ("[ngram_extract.py]Error: last feature on input line is not labeled tag_sig")
                print ("line: %s" % line)
                sys.exit()

            # replace pos_sig with a string made of the first char of each
            # pos in the phrase, e.g. JJ_NN_NNS => JNN
            pos_sig = "".join(item[0] for item in pos_sig[8:].split("_"))

            # grab the prev_Npr feature, if there is one
            prev_Npr = ""
            prev_N = ""
            match = re.search(r'prev_Npr=(\S+) ', line)
            if match is not None:
                prev_Npr = match.group(1)
                # canonicalize the noun
                prev_N = can.get_canon_np(prev_Npr.split("_")[0])

            l_term_info.append([str(ngram_len), canon_np, term, doc_id,
                                pos_sig, prev_Npr, prev_N])
    finally:
        # Release the input even on error or early exit.
        s_infile.close()
    return(l_term_info)
def parse_snp50_csv(fn):
    """Iterate over the rows of a SNP50 CSV file with a header line.

    Yields ``(snp_num, ss, rs, chrom, pos)`` read from the columns
    snp_number, ss_id, rs_id, umd30_bta and umd30_pos. A non-empty rs_id
    gets the prefix "rs", chromosome "30" is reported as "X", and the
    position is converted to an integer when it looks like one (it is
    left as text otherwise).
    """
    with gzopen(fn) as handle:
        for record in csv.DictReader(handle):
            number = int(record["snp_number"])
            ss_id = record["ss_id"]
            rs_id = record["rs_id"]
            rs_id = "rs" + rs_id if rs_id else rs_id
            chromosome = record["umd30_bta"]
            chromosome = "X" if chromosome == "30" else chromosome
            position = record["umd30_pos"]
            try:
                position = int(position)
            except ValueError:
                # Not a number: keep the raw text.
                pass
            yield number, ss_id, rs_id, chromosome, position
def merge(fnamelist):
    """Merge the 4-th column of several files into one table on stdout.

    The files are read in parallel, line by line. Every output line holds
    the first three columns of the first file (seqname, start, end)
    followed by the 4-th column of each file. The header names each file
    by the identifier found in its name (3 digits and an optional
    lowercase letter, preceded by "-").

    All the files opened are closed on exit, including on error.
    """
    flist = []
    try:
        for fname in fnamelist:
            flist.append(gzopen(fname))
        # Get names and print header.
        idexpr = r'-(\d{3}[a-z]?)'
        IDs = [re.search(idexpr, fname).group(1) for fname in fnamelist]
        sys.stdout.write('seqname\tstart\tend\t' + '\t'.join(IDs) + '\n')
        # Iterate through all the files at the same time with 'izip'.
        for linetuple in izip(*flist):
            # Extract seqname, start and end from first file.
            mapping = '\t'.join(linetuple[0].split()[:3])
            # Extract 4-th column and print.
            entries = '\t'.join([line.split()[3] for line in linetuple])
            sys.stdout.write(mapping + '\t' + entries + '\n')
    finally:
        for f in flist:
            f.close()
def main(mapfile):
    """Tally barcode/position pairs of a map file and print a report.

    Every line updates the module-level counters ``brcd_counter`` and
    ``pos_counter``: reads without hit are tallied under ``no_hits``,
    reads with several hits under ``many_hits``, all the others under
    their (chromosome, position).

    Positions whose most frequent barcode has less than 8 times the count
    of the runner-up are printed with their counts ("Charlies"), followed
    by a separator line. Then, for every barcode whose most frequent
    location has at least 8 times the count of the runner-up and is not
    in ``position_unknown``, the barcode, location and count are printed.
    """
    with gzopen(mapfile) as handle:
        for record in handle:
            columns = record.split()
            barcode = columns[0].split(':')[0]
            location_field = columns[-1]
            # A final '-' indicates that the sequence has no hit.
            if location_field == '-':
                brcd_counter[barcode][no_hits] += 1
                continue
            # Several hits are separated by ",".
            locations = location_field.split(',')
            if len(locations) != 1:
                brcd_counter[barcode][many_hits] += 1
                continue
            chrom, _strand, coord, _ignore = locations[0].split(':')
            brcd_counter[barcode][(chrom, coord)] += 1
            pos_counter[(chrom, coord)][barcode] += 1

    # Find the Charlies.
    Charlies = set()
    for site, tally in pos_counter.items():
        top = tally.most_common(2)
        if len(top) < 2:
            continue
        first_count, second_count = top[0][1], top[1][1]
        if first_count < 8 * second_count:
            Charlies.add(site)
            print('%s %s' % (site, tally))

    print('----')

    # Find the Bobs.
    Bobs = set()
    for barcode, tally in brcd_counter.items():
        top = tally.most_common(2)
        if len(top) == 2:
            (best_site, first_count), (_, second_count) = top
        else:
            ((best_site, first_count), ) = tally.most_common(1)
            second_count = 0
        if first_count < 8 * second_count:
            Bobs.add(barcode)
        elif best_site not in position_unknown:
            print('%s %s %s' % (barcode, best_site, first_count))
def parse_snp_chip_txt(fn):
    """Read a tab-separated SNP chip table.

    The first line is the header; its first column is dropped. On every
    other line the first column is the SNP number (integer) and the
    remaining columns are the data of that SNP.

    Returns ``(headers, data)`` where ``headers`` is the list of column
    names and ``data`` maps each SNP number to its list of values.

    Raises ParseError, with the file name and the 1-based line number,
    when a line does not have as many values as there are headers.
    """
    data = {}
    with gzopen(fn) as f:
        hline = next(f).rstrip("\r\n")
        headers = hline.split("\t")[1:]
        # The header is line 1, so the data starts on line 2.
        for lineno, line in enumerate(f, 2):
            fields = line.rstrip("\r\n").split("\t")
            snp_num = int(fields[0])
            data[snp_num] = fields[1:]
            if len(data[snp_num]) != len(headers):
                raise ParseError(
                    "Error: {}: line {}: mismatch between number of headers ({}) and data array length ({})."
                    .format(fn, lineno, len(headers), len(data[snp_num])))
    return headers, data
def get_Unmap(input_file, output_dir='.'):
    """Save in fastq format the reads selected from the gemMap output.

    The records whose 4-th column is exactly "0:1" are selected
    (NOTE(review): the original description calls them "unmapped reads";
    confirm that "0:1" is the intended criterion). They are written to
    ``unmapped-<input base name>.fastq`` in ``output_dir``, which is
    created if missing.

    Errors raised while reading the input are reported on stderr and
    re-raised.
    """
    try:
        with gzopen(input_file) as in_f:
            # The garbage collector is paused while the list is built and
            # restored afterwards, whether or not reading succeeds.
            gc.disable()
            try:
                reads_unmapped = []
                for line in in_f:
                    items = line.rstrip().split('\t')
                    if items[3] == "0:1":
                        # The 4 lines of a fastq record:
                        # 1. the id of the read
                        # 2. the read sequence
                        # 3. the strand + the id ('+' strand is assumed)
                        # 4. the quality code
                        reads_unmapped.append(
                            ["@" + items[0], items[1],
                             "+" + items[0], items[2]])
            finally:
                gc.enable()
    except:
        sys.stderr.write('file error:%s' % (os.path.splitext(input_file)[0]))
        raise

    # Create the output directory; tolerate a concurrent creation.
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise

    base = os.path.splitext(os.path.basename(input_file))[0]
    output_fname_unmap = "unmapped-%s.fastq" % (base)
    output_file_unmap = str(os.path.join(output_dir, output_fname_unmap))

    with open(output_file_unmap, 'w') as output_f_unmap:
        for record in reads_unmapped:
            for item in record:
                output_f_unmap.write(item + '\n')
def dir2features_count(filelist_file, out_root, sections, year, overwrite_p,
                       max_doc_terms_count=1000, canonicalize_p=True,
                       filter_noise_p=True):
    """Compute term, feature and term-feature statistics for a corpus.

    Arguments:
      filelist_file: text file listing one d3_feats file per line.
      out_root, sections, year: the output files are named
          <out_root>/<sections>/<year>.<extension>.
      overwrite_p: recompute even if the .tf file already exists.
      max_doc_terms_count: only the terms found within this many lines of
          a document are listed in the .doc_terms file.
      canonicalize_p: canonicalize terms and features before counting.
      filter_noise_p: see the NOTE(review) in the line loop.

    Output files (tab-separated):
      .doc_terms  doc id followed by the terms of the document
      .tf         term, feature, pair doc freq, pair prob, prob of feature
                  given term, prob of term given feature, term doc freq,
                  feature doc freq, mutual information, normalized MI
      .terms      term, doc freq, corpus freq, doc prob
      .feats      feature, doc freq, corpus freq, doc prob
      .canon      surface noun and canonical form, when they differ
      .cs         corpus size (number of documents processed)

    "Doc freq" (dfreq) is the number of documents an item occurs in,
    "corpus freq" (cfreq) its total number of occurrences.

    Relies on the module-level names ``canon``, ``can``, ``gzopen`` and
    ``DOC_TERMS_NOISE``.
    """
    out_path = "/".join([out_root, sections])
    out_path_prefix = "/".join([out_path, year])
    # term-feature output file
    tf_file = out_path_prefix + ".tf"
    # mapping between surface head nouns and their canonicalized forms
    canon_file = out_path_prefix + ".canon"

    print("[act_tf.py]creating path: %s,\n[act_tf.py]writing to %s" % (out_path, tf_file))
    try:
        # create directory path for corpus, if it does not already exist
        os.makedirs(out_path)
    except OSError:
        print("[act_tf.py]NOTE: Path already exists (or cannot be created).")

    # Do not continue if the .tf file already exists for this corpus/year.
    if os.path.isfile(tf_file) and not overwrite_p:
        print("[tf.py]file already exists: %s. No need to recompute." % tf_file)
        return

    terms_file = out_path_prefix + ".terms"
    feats_file = out_path_prefix + ".feats"
    corpus_size_file = out_path_prefix + ".cs"
    doc_terms_file = out_path_prefix + ".doc_terms"

    # Number of docs / number of instances of each pair, term and feature.
    d_pair2dfreq = defaultdict(int)
    d_pair2cfreq = defaultdict(int)
    d_term2dfreq = defaultdict(int)
    d_term2cfreq = defaultdict(int)
    d_feat2cfreq = defaultdict(int)
    d_feat2dfreq = defaultdict(int)
    # doc_count needed for computing probs
    doc_count = 0

    # Features of interest, identified by the first 6 characters of their
    # name: prev_V, prev_VNP, prev_J, prev_Jpr, prev_Npr, last_word.
    feature_prefixes = ("prev_V", "prev_J", "prev_N", "last_w")

    with codecs.open(doc_terms_file, "w", encoding='utf-8') as s_doc_terms_file, \
            open(filelist_file) as s_filelist:
        for infile in s_filelist:
            infile = infile.strip("\n")
            # The file name without path or extensions is the doc id. It
            # is the first field of the doc_terms line, followed by all
            # the (legal) canonicalized terms, duplicates included.
            doc_id = os.path.basename(infile).split(".")[0]
            doc_terms_list = [doc_id]

            # Terms, features and pairs that occur in this document, used
            # to update the doc frequencies once per document.
            pair_set = set()
            term_set = set()
            feature_set = set()

            s_infile = gzopen.gzopen(infile)
            try:
                # i counts the lines of the d3_feats file
                i = 0
                for term_line in s_infile:
                    i += 1
                    term_line = term_line.strip("\n")
                    l_fields = term_line.split("\t")
                    term = l_fields[2]

                    # NOTE(review): this test has no effect as written
                    # (it is not chained to the test below), so noise
                    # terms ARE counted. Confirm whether the next 'if'
                    # was meant to be an 'elif' before changing it.
                    if (filter_noise_p and canon.illegal_phrase_p(term)):
                        pass

                    # Eliminate lines that come from the claims section of
                    # patents (feature section_loc=CLAIM*): they are not
                    # very useful and skew term frequency counts.
                    if "=CLAIM" in term_line:
                        continue

                    if canonicalize_p:
                        # canonicalize the term before incrementing counts
                        term = can.get_canon_np(term)

                    term_set.add(term)
                    # increment the global corpus count for the term
                    d_term2cfreq[term] += 1

                    # There is no feature telling which section a line is
                    # in beyond the abstract, so a simple cut off on the
                    # line number is used to approximate the front part of
                    # the document.
                    if (i <= max_doc_terms_count) and (term not in DOC_TERMS_NOISE) and not canon.illegal_phrase_p(term):
                        doc_terms_list.append(term)

                    # fields 3 and beyond are feature-value pairs
                    for feature in l_fields[3:]:
                        if (feature[0:6] in feature_prefixes) and not canon.illegal_feature_p(feature):
                            # Features containing a hyphen are not
                            # canonicalized: get_canon_feature is known to
                            # return a term with a blank in that case,
                            # e.g. "last_word=compass-on-a-chip" gives
                            # 'last_word=compas-on-a chip'.
                            if canonicalize_p and "-" not in feature:
                                feature = can.get_canon_feature(feature)

                            # increment global corpus counts
                            d_feat2cfreq[feature] += 1
                            feature_set.add(feature)
                            d_pair2cfreq[(term, feature)] += 1
                            pair_set.add((term, feature))

                # tab-separated string with the doc id and all terms
                doc_terms_str = "\t".join(doc_terms_list)
                s_doc_terms_file.write("%s\n" % doc_terms_str)
            finally:
                s_infile.close()

            # Each pair, term and feature counts once per document.
            for pair in pair_set:
                d_pair2dfreq[pair] += 1
            for term in term_set:
                d_term2dfreq[term] += 1
            for feature in feature_set:
                d_feat2dfreq[feature] += 1

            # track total number of docs
            doc_count += 1

    with codecs.open(tf_file, "w", encoding='utf-8') as s_tf_file, \
            codecs.open(terms_file, "w", encoding='utf-8') as s_terms_file, \
            codecs.open(feats_file, "w", encoding='utf-8') as s_feats_file:
        print("[act_tf.py]Writing to %s" % tf_file)
        print("[act_tf.py]Processed %i files" % doc_count)

        for pair in d_pair2dfreq:
            freq_pair = d_pair2dfreq[pair]
            prob_pair = float(freq_pair) / doc_count
            term = pair[0]
            feature = pair[1]
            freq_term = d_term2dfreq[term]
            freq_feat = d_feat2dfreq[feature]

            # A term of a pair is occasionally missing from d_term2dfreq
            # (freq of 0). Such pairs are ignored, since they would create
            # a divide by 0 error.
            if freq_term > 0 and freq_feat > 0:
                # prob of the feature occurring with the term in a doc,
                # given that the term appears in the doc
                prob_fgt = freq_pair / float(freq_term)
                # prob of the feature occurring with the term in a doc,
                # given that the feature appears in the doc
                prob_tgf = freq_pair / float(freq_feat)

                # Mutual information based on the doc counts of pairs,
                # terms and features, and on the corpus size (# docs):
                # MI = log(prob(pair) / (prob(term) * prob(feature)))
                mi_denom = (freq_term) * (freq_feat) / float(doc_count)
                mi = math.log(freq_pair / mi_denom)
                # Normalize to -1 to 1. If prob_pair == 1 the log is 0, so
                # a small amount is subtracted to avoid dividing by 0.
                if prob_pair == 1:
                    prob_pair = prob_pair - .000000001
                npmi = mi / (-math.log(prob_pair))

                s_tf_file.write("%s\t%s\t%i\t%f\t%f\t%f\t%i\t%i\t%f\t%f\n" % (term, feature, freq_pair, prob_pair, prob_fgt, prob_tgf, freq_term, freq_feat, mi, npmi))
            else:
                # print out a warning about terms with 0 freq.
                print("[act_tf.py]WARNING: term-feature pair: %s has freq = 0. Ignored." % (pair,))

        for term in d_term2dfreq:
            term_prob = float(d_term2dfreq[term]) / doc_count
            s_terms_file.write("%s\t%i\t%i\t%f\n" % (term, d_term2dfreq[term], d_term2cfreq[term], term_prob))

        for feat in d_feat2dfreq:
            feat_prob = float(d_feat2dfreq[feat]) / doc_count
            s_feats_file.write("%s\t%i\t%i\t%f\n" % (feat, d_feat2dfreq[feat], d_feat2cfreq[feat], feat_prob))

    with codecs.open(canon_file, "w", encoding='utf-8') as s_canon_file:
        for key, value in can.d_n2canon.items():
            # Only write out a line if the canonical form differs from
            # the surface form.
            if key != value:
                s_canon_file.write("%s\t%s\n" % (key, value))

    # Finally, store the corpus size (# docs processed).
    with open(corpus_size_file, "w") as s_corpus_size_file:
        s_corpus_size_file.write("%i\n" % doc_count)
    print("[act_tf.py dir2features_count]Storing corpus size in %s " % corpus_size_file)
import fileinput
import os
import pdb
import re
import seeq
import sys
import subprocess
import tempfile

from collections import defaultdict
from gzopen import gzopen
from itertools import izip

# Output FASTA path: the input path with '_2map' appended.
TOMAPfname = sys.argv[1] + '_2map'

with gzopen(sys.argv[1]) as fastq, open(TOMAPfname, 'w') as fasta:
    for lineno, line in enumerate(fastq):
        # FASTQ records are four lines long; only the second (sequence)
        # line of each record is used.
        if lineno % 4 != 1:
            continue
        read = line.rstrip()
        # Exact search of the NlaIII site (CATG): the barcode is what
        # precedes the first occurrence.
        pieces = read.split('CATG')
        if len(pieces) < 2:
            # No CATG in the read at all.
            continue
        brcd = pieces[0]
        # Keep the stretch between the first and second CATG, cut at the
        # first MlucI site (AATT) if there is one.
        dna = pieces[1].split('AATT')[0]
        # Write a FASTA record only for barcodes of 11-21 nt with more
        # than 5 nt of sequence to map.
        if 10 < len(brcd) < 22 and len(dna) > 5:
            fasta.write('>%s\n%s\n' % (brcd, dna))
# Map the sequences
# Sets up arguments for user input
def setupParser():
    """Build the command-line parser and parse the process arguments.

    Returns:
        The ``argparse`` namespace with two string attributes, ``input``
        (-i/--input) and ``output`` (-o/--output). Each is ``None`` when
        the option is not given on the command line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", action="store",
                        help="fastq.gz file you want to convert", type=str)
    parser.add_argument("-o", "--output", action="store",
                        help="Name of the file you want the converted file.",
                        type=str)
    return parser.parse_args()


# Code starts here
if __name__ == "__main__":
    args = setupParser()
    input_file, output_file, sample_index = parseArguments(args)
    with gzopen.gzopen(input_file) as input_fastq, \
            gzip.open(output_file, "wb") as output_fastq:
        # NOTE(review): `line_chunk` is not defined in this block and
        # `input_fastq` is never read here -- confirm where `line_chunk`
        # comes from (it may be meant to be derived from `input_fastq`).
        for line in line_chunk:
            parsed_line = parseLine(line, sample_index=sample_index)
            output_fastq.write(parsed_line)
    # Parenthesised single-argument form prints the same text on
    # Python 2 and Python 3.
    print("Conversion of %s complete!" % input_file)
items = line.split() sgRNA = items[9] nreads = items[0].split('_')[-1] chrom = items[2] pos = int(items[3]) if chrom not in dTree: # Do not consider alternate chromosomes # do not add them to 'dNread'. continue seen = set() for hit in dTree[chrom].query(pos): if hit.data in seen: continue # Every hit is a gene intersected by the position # of the gRNA (more specifically one exon of the # gene is intersected by the position of the gRNA). sys.stdout.write('%s\t%s\t%s\n' % (sgRNA, hit.data, nreads)) seen.add(hit.data) if __name__ == '__main__': sys.setrecursionlimit(10000) # Prepare transcript-gene lookup with gzopen(sys.argv[1]) as f: cast = read_dict(f) # Convert exon locations to interval tress for fast search. with gzopen(sys.argv[2]) as f: dTree = exons_to_interval_tree(f, cast) # Intersect read positions with exons. with gzopen(sys.argv[3]) as f: map_reads_to_exons(f, dTree)
#! -*- coding:utf-8- -*-

import re
import sys

from gzopen import gzopen

with gzopen(sys.argv[1]) as infile:
    for row in infile:
        # Every row holds two tab-separated fields: the pair, then its
        # integer score.
        raw_pair, score = row.rstrip('\n').split('\t')
        if int(score) >= 11:
            # Strip quotes and parentheses, replace spaces with '_', and
            # split the two members of the pair on ',_'.
            cleaned = re.sub("[')(]", '', raw_pair)
            members = cleaned.replace(' ', '_').split(',_')
            sys.stdout.write(
                '%s (u) %s = %s\n' % (members[0], members[1], score))
def call_starcode_on_fastq_file(fname_fastq):
    '''Extract barcodes and spikes from a FASTQ file and run starcode.

    The sequence line of every FASTQ record is matched against the GFP
    pattern first and the spike pattern second. When one of them matches
    and the match starts at a position between MIN_BRCD and MAX_BRCD
    (inclusive), the part of the read before the match is written to the
    corresponding temporary file. `starcode` is then run once per
    temporary file.

    Args:
        fname_fastq: path of the FASTQ file (opened with `gzopen`).

    Returns:
        Tuple `(brcd_outfname, spk_outfname)` with the names of the two
        starcode output files. When both already exist, nothing is
        computed and the names are returned immediately.
    '''
    MIN_BRCD = 15
    MAX_BRCD = 25

    brcd_outfname = re.sub(r'\.fastq.*', '_starcode.txt', fname_fastq)
    spk_outfname = re.sub(r'\.fastq.*', '_spikes_starcode.txt', fname_fastq)
    # If the name has no '.fastq' part the substitution is a no-op;
    # append the suffix so the input file is never overwritten.
    if brcd_outfname == fname_fastq:
        brcd_outfname = fname_fastq + '_starcode.txt'
    if spk_outfname == fname_fastq:
        spk_outfname = fname_fastq + '_spikes_starcode.txt'

    if os.path.exists(brcd_outfname) and os.path.exists(spk_outfname):
        return (brcd_outfname, spk_outfname)

    GFP = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
    SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)

    barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
    spike_tempf = tempfile.NamedTemporaryFile(delete=False)
    try:
        with gzopen(fname_fastq) as f:
            for lineno, line in enumerate(f):
                # Only the sequence line of each 4-line record is used.
                if lineno % 4 != 1:
                    continue
                hit = GFP.match(line)
                if hit is not None:
                    outf = barcode_tempf
                else:
                    hit = SPIKE.match(line)
                    if hit is None:
                        continue
                    outf = spike_tempf
                pos = hit.matchlist[0][0]
                if MIN_BRCD <= pos <= MAX_BRCD:
                    outf.write(line[:pos] + '\n')
        # Flush to disk before handing the files to starcode.
        barcode_tempf.close()
        spike_tempf.close()

        # Skip an output that already exists.
        if not os.path.exists(brcd_outfname):
            subprocess.call([
                'starcode', '-t4',
                '-i', barcode_tempf.name,
                '-o', brcd_outfname,
            ])
        if not os.path.exists(spk_outfname):
            subprocess.call([
                'starcode', '-t4',
                '-i', spike_tempf.name,
                '-o', spk_outfname,
            ])
    finally:
        # The temporary files were created with delete=False, so remove
        # them explicitly, also when reading or starcode fails.
        for tempf in (barcode_tempf, spike_tempf):
            tempf.close()
            os.unlink(tempf.name)

    return (brcd_outfname, spk_outfname)
def parse_asn(fn, assembly):
    """Iterate over the SNP entries of a ' | '-delimited flat file.

    Entries are separated by blank lines and the first three lines of the
    file are skipped as header. An entry is yielded as
    `(rs, ss, chrom, pos, ref, alt)` once its `rs` line carries a `snp`
    flag, at least one `ss` line was seen, and a `CTG` line whose
    `assembly` value equals `assembly` was found. `ss` is the list of all
    `ss` identifiers of the entry; `ref` and `alt` stay `None` unless the
    `SNP` line has a 5-character `alleles` value.
    """
    def subsplit(lst, tok="="):
        """Turn 'key=value' items into a dict; bare items map to True."""
        parsed = {}
        for item in (raw.strip() for raw in lst):
            parts = item.split(tok)
            if len(parts) == 2:
                parsed[parts[0]] = parts[1]
            elif len(parts) == 1:
                parsed[item] = True
            # Items containing `tok` more than once are ignored.
        return parsed

    rs = ss = chrom = pos = ref = alt = None
    indel = False

    with gzopen(fn) as f:
        # skip header
        for _ in range(3):
            next(f)

        for raw_line in f:
            line = raw_line.strip()

            if not line:
                # A blank line closes the current entry.
                if None not in (rs, ss, chrom, pos):
                    yield rs, ss, chrom, pos, ref, alt
                rs = ss = chrom = pos = ref = alt = None
                indel = False
                continue

            if indel:
                # The rest of a non-snp entry is ignored.
                continue

            fields = line.split(" | ")
            tag = fields[0]

            if tag.startswith("rs"):
                if "snp" in subsplit(fields[1:]):
                    rs = tag
                else:
                    indel = True
            elif tag.startswith("ss"):
                if ss is None:
                    ss = []
                ss.append(tag)
            elif tag == "SNP":
                alleles = subsplit(fields[1:]).get("alleles")
                # biallelic string, e.g. 'A/G' (quotes included)
                if alleles is not None and len(alleles) == 5:
                    ref, alt = alleles[1], alleles[3]
            elif tag == "CTG" and chrom is None:
                attrs = subsplit(fields[1:])
                if attrs.get("assembly") == assembly:
                    chrom = attrs.get("chr")
                    pos = int(attrs.get("chr-pos", -1))

        # The last entry may not be followed by a blank line.
        if None not in (rs, ss, chrom, pos):
            yield rs, ss, chrom, pos, ref, alt
def parse_col0(fn):
    """Yield the first whitespace-delimited field of each line of `fn`.

    The file is opened with `gzopen`. Lines that are empty or contain
    only whitespace have no first field and are skipped.
    """
    with gzopen(fn) as f:
        for line in f:
            fields = line.split(None, 1)
            if fields:
                yield fields[0]
if args.methylkit: ichrom = 1 ipos = 2 elif args.pileOmeth: ichrom = 0 ipos = 1 else: raise Exception("Are we doing methylkit or pileOmeth?") chrmap = {s: i for i, s in enumerate(parse_col0(args.chr_list))} f1 = parse_tsv(args.in1, ichrom, ipos, chrmap) f2 = parse_tsv(args.in2, ichrom, ipos, chrmap) l1 = l2 = oline = None with gzopen("/dev/stdout", "w", args.gzip) as fout: try: k1, l1 = next(f1) k2, l2 = next(f2) while True: if k1 < k2: oline = l1 l1 = None k1, l1 = next(f1) elif k2 < k1: oline = l2 l2 = None k2, l2 = next(f2) else: if args.methylkit: oline = l1[:4]
def binit(ref_limits, bin_size, mismatch, input_file, output_dir='.'):
    """Count mapped reads per fixed-size genomic bin and write a BED file.

    Each input line is tab-separated. Lines whose fifth field is '-' are
    skipped. A line is counted when the number of '0' characters in its
    fourth field is at most `mismatch`; its bin is taken from the fifth
    field, read as 'chrom:strand:start:...' (only the first mapping is
    used). One output line is written for every bin of every chromosome
    listed in the size table, including bins with a zero count.

    Args:
        ref_limits: name of the chromosome-size dict to use, looked up as
            an attribute of the `limits` module (e.g. 'hg19', 'mm10').
        bin_size: width of a bin.
        mismatch: largest accepted count of '0' in the fourth field.
        input_file: path of the mapping file (opened with `gzopen`).
        output_dir: directory for the output; created when missing.

    Raises:
        ValueError: if the `limits` module has no entry `ref_limits`.

    The output is '<bin_size>bin-<input base name>.bed' in `output_dir`.
    A malformed line (too few fields) stops the parsing with a message on
    stderr; the bins counted so far are still written.
    """
    # limits.py holds one dict of chromosome sizes per genome, e.g.
    # {'chr1': 1898309, 'chr2': 2902930, ...}
    limits = __import__("limits").__dict__.get(ref_limits)
    if limits is None:
        raise ValueError(
            "no chromosome sizes named %r in limits.py" % (ref_limits,))

    freq_table = defaultdict(int)
    f = None
    try:
        f = gzopen(input_file)
        for line in f:
            # Fields: read name, sequence, quality, map count, position(s).
            item = line.rstrip().split('\t')
            if item[4] == '-':
                continue
            # e.g. "0:0:1"
            no_mismatch = item[3].count("0")
            if no_mismatch <= mismatch:
                # 'mapping' is like "chr1:+:12942:34T1,chr15:-:102518193:34T1"
                getmap = item[4].split(":")
                chrom = getmap[0]
                start = getmap[2]
                # Floor division keeps the bin index integral.
                freq_table[(chrom, int(start) // bin_size)] += 1
    except IndexError:
        sys.stderr.write("Check if your file is properly formatted, !field separator is a tab!")
    except Exception:
        sys.stderr.write('file error:%s' % input_file)
        raise
    finally:
        # `f` is still None when opening the file itself failed.
        if f is not None:
            f.close()

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    head, tail = os.path.split(input_file)
    base = os.path.splitext(tail)[0]
    output_fname = "%sbin-%s.bed" % (bin_size, base)
    output_file = str(os.path.join(output_dir, output_fname))

    with open(output_file, 'w') as output_f:
        for chrom in sorted(limits):
            chrom_size = int(limits[chrom])
            maxbin = int(chrom_size // bin_size)
            for bin_index in range(maxbin):
                output_f.write("%s\t%s\t%s\t%d\n" % (
                    chrom,
                    1 + bin_index * bin_size,
                    (1 + bin_index) * bin_size,
                    freq_table.get((chrom, bin_index), 0)))
# -*- coding:utf-8 -*- # This script takes 5'UTR regions of every Drosophila annotated gene in flybase to associate it with experimental modENCODE TSS's from 5' Race experiments import re import sys from gzopen import gzopen records = {} # First we create the header of the file sys.stdout.write('FBgnID\tchromosome\tstrand\tstartUTR\tendUTR\n') # Open the Fasta file from Flybase with all the annotated 5'UTR's with gzopen('dmel-all-five_prime_UTR-r5.52.fasta.gz') as f: for line in f: # We work with the Fasta header of each entry: # >FBtr0086024 type=five_prime_untranslated_region; loc=2R:complement(1946941..1947063); # name=CG7856-RA; MD5=0e29152561825b6636f4f7408d1ccfbb; length=123; parent=FBgn0033056; release=r5.52; species=Dmel; if line[0] != '>': continue (chrom, ori, start, end, parent) = re.search( r'loc=([^:]+):(join\(|complement\()?(\d+)\.[\d.,]*\.(\d+).*parent=(FBgn\d{7})', line).groups() strand = '-' if ori == 'complement(' else '+' # Keep only the shortest 5'UTR for a given start position. realstart = start if strand == '+' else end if records.has_key((parent, realstart)): record = records[(parent, realstart)] if int(end) - int(start) > int(record[4]) - int(record[3]): continue
def vcf_parser(filename, yield_headers=False, yield_samples=True,
               yield_genotypes=True, parse_genotypes=True):
    """Parse a VCF file opened with `gzopen`, yielding the requested parts.

    Items are yielded in this order:

    1. If `yield_headers`: every line read while scanning the header,
       with the line ending removed. The scan stops at the '#CHROM' line
       or at the first line not starting with '#'.
    2. If `yield_samples`: the list of sample names (columns 10 onward of
       the '#CHROM' line).
    3. If `yield_genotypes`: one tuple per variant line.
       With `parse_genotypes` false:
       `(lineno, chrom, pos, id, ref, alt, qual, filter, info_str,
       format_str, genotype_str_list)`.
       With `parse_genotypes` true:
       `(lineno, line, chrom, pos, id, ref, alt, qual, filter, info_str,
       fmt_fields, genotypes)`, where `genotypes[sample][fmt]` holds the
       sub-field `fmt` of that sample for this line only.

    A leading 'chr' is removed from the chromosome name and `pos` is an
    int. The FORMAT column is split once, on the first variant line, and
    assumed to be the same for every line.

    Raises:
        ParseError: if the '#CHROM' line has fewer than 9 columns, or if
            `yield_samples` is set and no '#CHROM' line was found.
    """
    samples = None

    with gzopen(filename) as f:
        # parse header
        # NOTE(review): when the header scan stops on a line that does not
        # start with '#', that line has already been consumed and is not
        # parsed as a variant.
        for lineno, line in enumerate(f, 1):
            if yield_headers:
                yield line.rstrip("\r\n")
            if not line.startswith("#"):
                break
            if line.startswith("#CHROM"):
                # column header line
                # #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample0 [sample1 ...]
                fields = line.rstrip("\r\n").split("\t")
                if len(fields) < 9:
                    raise ParseError(
                        "{}: line {}: expected at least 9 columns for vcf column header."
                        .format(filename, lineno))
                samples = fields[9:]
                break

        if yield_samples:
            if samples is None:
                raise ParseError(
                    "{}: no vcf column header found.".format(filename))
            yield samples

        if not yield_genotypes:
            return

        fmt_fields = None

        # parse variants
        for line in f:
            lineno += 1
            line = line.rstrip("\r\n")
            fields = line.split("\t")
            chrom = fields[0]
            pos = int(fields[1])
            var_id = fields[2]
            ref = fields[3]
            alt = fields[4]
            qual = fields[5]
            filt = fields[6]
            info_str = fields[7]
            format_str = fields[8]
            genotype_str_list = fields[9:]

            if chrom.startswith("chr"):
                chrom = chrom[3:]

            if not parse_genotypes:
                yield (lineno, chrom, pos, var_id, ref, alt, qual, filt,
                       info_str, format_str, genotype_str_list)
                continue

            if fmt_fields is None:
                fmt_fields = format_str.split(":")

            # Build a fresh mapping for every line: a genotype with fewer
            # sub-fields than FORMAT must not inherit values from the
            # previous line, and yielded tuples must not share one dict.
            genotypes = collections.defaultdict(dict)
            for sample, genotype_str in zip(samples, genotype_str_list):
                gen_fields = genotype_str.split(":")
                for fmt, gen in zip(fmt_fields, gen_fields):
                    genotypes[sample][fmt] = gen

            yield (lineno, line, chrom, pos, var_id, ref, alt, qual, filt,
                   info_str, fmt_fields, genotypes)
if __name__ == "__main__": args = parse_args() mk_files = [None, None, None] pm_files = [None, None, None] if args.gzip: suffix = ".gz" else: suffix = "" if args.methylkit: if args.cpg: mk_files[0] = gzopen( "{}.methylkit.CpG.txt{}".format(args.oprefix, suffix), "w") if args.chg: mk_files[1] = gzopen( "{}.methylkit.CHG.txt{}".format(args.oprefix, suffix), "w") if args.chh: mk_files[2] = gzopen( "{}.methylkit.CHH.txt{}".format(args.oprefix, suffix), "w") for f in mk_files: if f is not None: print("chrBase\tchr\tbase\tstrand\tcoverage\tfreqC\tfreqT", file=f) if args.pileOmeth: if args.cpg: pm_files[0] = gzopen(
fn_re.match(f)] # The reads from the qMiseq come separated in 4 lanes. Merge them. for sample in samples: # Ex. PromoterA12_S12_L001_R1_001.fastq.gz fn_items = sample.split('_') outfname = fn_items[0] + '.fastq' if os.path.isfile(outfname): sys.stdout.write("Sample: %s already processed, skiping.\n" % outfname.split('/')[-1]) continue lanes = ['L001','L002','L003','L004'] for lane in lanes: if lane == 'L001': toextract = '_'.join([fn_items[0],fn_items[1],lane, fn_items[3],fn_items[4]]) with gzopen(sample) as f, open(outfname,'w') as g: for line in f: g.write(line) else: toextract = '_'.join([fn_items[0],fn_items[1],lane, fn_items[3],fn_items[4]]) with gzopen(sample) as f, open(outfname,'a') as g: for line in f: g.write(line)
#!/usr/bin/python # -*- coding:utf-8 -*- import re import sys import tempfile from gzopen import gzopen BYTES = 65536 s = int(sys.argv[1]) pad = ''.join(['#']*36) with tempfile.TemporaryFile() as temp: # Make a temp fasta file without newline on the sequence. with gzopen(sys.argv[2]) as f: txt = f.read(BYTES) while txt != '': while '>' in txt: txt += f.read(BYTES) header = re.search(r'\n?>[^\n]+\n', txt) temp.write(txt[:header.start()].replace('\n', '')) temp.write(txt[header.start():header.end()]) txt = txt[header.end():] temp.write(txt.replace('\n', '')) txt = f.read(BYTES) # Reset temp file and read line by line. temp.seek(0) for line in temp: if line[0] == '>':
import fileinput
import os
import pdb
import re
import seeq
import sys
import subprocess
import tempfile

from collections import defaultdict
from gzopen import gzopen
from itertools import izip

# The FASTA file to map is named after the input, plus '_2map'.
TOMAPfname = sys.argv[1] + '_2map'

with gzopen(sys.argv[1]) as reads, open(TOMAPfname, 'w') as tomap:
    for lineno, line in enumerate(reads):
        # Input is FASTQ: keep only the sequence line of each record.
        if lineno % 4 != 1:
            continue
        sequence = line.rstrip()
        # Exact search of NlaIII (CATG); reads without it are dropped.
        segments = sequence.split('CATG')
        if len(segments) == 1:
            continue
        brcd = segments[0]
        # Sequence between the first and second CATG, cut at the first
        # MlucI site (AATT) if present.
        dna = segments[1].split('AATT')[0]
        # Write fasta to map it: barcode of 11-21 nt, more than 5 nt of DNA.
        barcode_ok = 10 < len(brcd) < 22
        if barcode_ok and 5 < len(dna):
            tomap.write('>%s\n%s\n' % (brcd, dna))
# Map the sequences
cst = seeq.compile(r'CGCACTAATGAATTCGTTGCA', 4) GATCGATC = seeq.compile(r'GATCGATC', 1) for line in f: # First remove the constant part, keep the left part # with oligo-specific nucleotides plus GATCGATC, and # keep the UMI on the right. try: oligo, ignore, umi = cst.match(line.rstrip()).tokenize() # Target length is 32. Allow at most 2 indels. if not 30 <= len(oligo) <= 34: continue except (ValueError, AttributeError): continue # Then split the oligo part to extract GATCGATC try: start, end, ignore = GATCGATC.match(oligo[10:22]).matchlist[0] except AttributeError: continue brcd = oligo[:10 + start] + oligo[10 + end:] readout = oligo[10 + start:10 + end] # Output fingerprint and GATCGATC fingerprint = brcd + 'AGATACAGAGATAATACA' + umi sys.stdout.write('%s\t%s\n' % (fingerprint, readout)) if __name__ == '__main__': with gzopen(sys.argv[1]) as f: extract_fingerprint_and_GATCGATC(f)
def Rvtests_bin(Sumstat, fasta):
    """Align a summary-statistics file to the reference genome (OR output).

    Reads `Sumstat` with `gzopen` and writes '<basename of Sumstat>.Tidy'
    in the current directory. Lines starting with '##' or '#G' are
    dropped. The header line ('CHROM' or '#CHROM') is passed to
    `check_tool`, gets four extra column names and has its BETA column
    renamed to 'OR'. Data rows are kept only for chromosomes 1-22 and
    23/'X', and only when `monomorphic_filter(row)` is False; each kept
    row is compared with the reference base fetched from `fasta` and
    rewritten by `_tidy_bin_row`.

    Args:
        Sumstat: path of the summary-statistics file.
        fasta: path of the reference FASTA, opened with pysam.

    Returns:
        A three-line text summary: number of rows whose reference allele
        differs from the genome, number of rows that match, and number of
        rows written.
    """
    out_name = str(Sumstat.split("/")[-1] + ".Tidy")
    count = 0  # number of variants written
    count_mm = 0  # mismatches
    count_m = 0  # matches
    # Both files are context-managed so the output is flushed and closed
    # even when a row fails to parse.
    with gzopen(Sumstat) as stats, open(out_name, "w") as OUT:
        genome = pysam.Fastafile(fasta)
        for line in stats:
            if line.startswith("##") or line.startswith("#G"):
                # Leading comments; the genomic lambda may be the last line.
                continue
            if line.startswith("CHROM") or line.startswith("#CHROM"):
                # The header identifies which package produced the file.
                check_tool(line)
                li_pos = line.strip().split("\t")
                li_pos.extend(["MAF_PH", "MAC_PH", "EAC_PH", "EAF_PH"])
                li_pos[BETA] = "OR"
                li_pos.append("\n")
                OUT.write("\t".join(li_pos))
                continue

            li_pos = line.strip("\n").split("\t")
            chr_n = li_pos[0]  # chr number
            bp_pos = li_pos[1]  # base position
            stat_ref = li_pos[2]  # reference or non effect allele

            is_x = (chr_n.isdigit() and int(chr_n) == 23) or chr_n == "X"
            is_autosome = chr_n.isdigit() and int(chr_n) < 23
            if not (is_x or is_autosome):
                continue
            if monomorphic_filter(li_pos) is not False:
                continue
            if is_x:
                # Chromosome 23 is queried as "X" in the reference.
                chr_n = "X"
            gen_ref = genome.fetch(chr_n, int(bp_pos) - 1, int(bp_pos))

            if _tidy_bin_row(li_pos, stat_ref, gen_ref):
                count_m = count_m + 1
            else:
                count_mm = count_mm + 1
            OUT.write("\t".join(li_pos))
            # NOTE(review): counted once per written data row; confirm the
            # header line was not meant to be counted as well.
            count = count + 1
    return str(
        count_mm
    ) + " Mismatches: Reference allele is effect allele" + "\n" + str(
        count_m) + " Matches to reference" + "\n" + str(
            count) + " Sites checked"


def _tidy_bin_row(li_pos, stat_ref, gen_ref):
    """Rewrite one data row in place; return True if it matches the genome.

    On a mismatch the reference/alternative alleles are replaced, BETA
    becomes the inverted odds ratio and the N_ref/N_alt counts are swapped
    before EAC is computed. In both cases MAF, MAC, EAC, EAF and a
    terminating newline field are appended.
    """
    if stat_ref != gen_ref:
        li_pos[2] = gen_ref  # swap references
        li_pos[3] = stat_ref  # swap alternative
        li_pos[BETA] = str(Beta2ORinvert(li_pos, md5))  # BETA -> OR
        li_pos.append(MAF(li_pos))
        li_pos.append(MAC(li_pos))
        # Swap genotype counts; this must happen before EAC is computed.
        li_pos[N_ref], li_pos[N_alt] = li_pos[N_alt], li_pos[N_ref]
        li_pos.append(EAC(li_pos))
        li_pos.append(str(1 - float(li_pos[AF])))  # EAF
        li_pos.append("\n")
        return False
    li_pos.append(MAF(li_pos))
    li_pos.append(MAC(li_pos))
    li_pos.append(EAC(li_pos))
    li_pos.append(str(float(li_pos[AF])))  # EAF
    li_pos[BETA] = str(Beta2OR(li_pos, md5))  # BETA -> OR
    li_pos.append("\n")
    return True
def translate(DNA):
    """Translate `DNA` codon by codon using the `gcode` lookup table.

    The sequence is read in consecutive 3-letter chunks from position 0;
    any chunk missing from `gcode` (including a short final chunk that is
    not a key) is rendered as '_'.
    """
    return ''.join(
        gcode.get(DNA[i:i + 3], '_') for i in range(0, len(DNA), 3))


def _read_counts(handle):
    """Read 'DNA count' lines from `handle` into a defaultdict(int)."""
    counts = defaultdict(int)
    for line in handle:
        DNA, count = line.split()
        counts[DNA] = int(count)
    return counts


def main(f0, f1, f14):
    """Print one line per sequence seen in any of the three count files.

    Each argument is an iterable of lines holding a sequence and its
    count. Every output line has the sequence, its translation and its
    count in `f0`, `f1` and `f14` (0 when absent), separated by spaces.
    """
    # Create one int dictionary per file.
    dict0 = _read_counts(f0)
    dict1 = _read_counts(f1)
    dict14 = _read_counts(f14)
    for DNA in set(dict0).union(dict1).union(dict14):
        # Single-argument print gives the same output on Python 2 and 3.
        print('%s %s %s %s %s' % (
            DNA, translate(DNA), dict0[DNA], dict1[DNA], dict14[DNA]))


if __name__ == "__main__":
    with gzopen(sys.argv[1]) as f0, gzopen(sys.argv[2]) as f1, \
            gzopen(sys.argv[3]) as f14:
        main(f0, f1, f14)
def Rvtests_Quan(Sumstat, fasta):
    """Align a summary-statistics file to the reference genome (BETA output).

    Reads `Sumstat` with `gzopen` and writes '<basename of Sumstat>.Tidy'
    in the current directory. Lines starting with '##' or '#G' are
    dropped. For the header line ('CHROM' or '#CHROM') the result of
    `check_tool` is printed and four extra column names are added. Data
    rows are kept only for chromosomes 1-22 and 23/'X', and only when
    `monomorphic_filter(row)` is False; each kept row is compared with the
    reference base fetched from `fasta` and rewritten by `_tidy_quan_row`.

    Args:
        Sumstat: path of the summary-statistics file.
        fasta: path of the reference FASTA, opened with pysam.

    Returns:
        A three-line text summary: number of rows whose reference allele
        differs from the genome, number of rows that match, and number of
        rows written.
    """
    out_name = str(Sumstat.split("/")[-1] + ".Tidy")
    count = 0  # number of variants written
    count_mm = 0  # mismatches
    count_m = 0  # matches
    # Both files are context-managed so the output is flushed and closed
    # even when a row fails to parse.
    with gzopen(Sumstat) as stats, open(out_name, "w") as OUT:
        genome = pysam.Fastafile(fasta)
        for line in stats:
            if line.startswith("##") or line.startswith("#G"):
                continue
            if line.startswith("CHROM") or line.startswith("#CHROM"):
                # The header identifies which package produced the file.
                print(check_tool(line))
                li_pos = line.strip("\n").split("\t")
                li_pos.extend(["MAF_PH", "MAC_PH", "EAC_PH", "EAF_PH"])
                li_pos[BETA] = "BETA"
                li_pos.append("\n")
                OUT.write("\t".join(li_pos))
                continue

            li_pos = line.strip("\n").split("\t")
            chr_n = li_pos[0]  # chr number
            bp_pos = li_pos[1]  # base position
            stat_ref = li_pos[2]  # reference or non effect allele

            is_x = (chr_n.isdigit() and int(chr_n) == 23) or chr_n == "X"
            is_autosome = chr_n.isdigit() and int(chr_n) < 23
            if not (is_x or is_autosome):
                continue
            # Genotype counts decide whether a site is informative.
            if monomorphic_filter(li_pos) is not False:
                continue
            if is_x:
                # Chromosome 23 is queried as "X" in the reference.
                chr_n = "X"
            gen_ref = genome.fetch(chr_n, int(bp_pos) - 1, int(bp_pos))

            if _tidy_quan_row(li_pos, stat_ref, gen_ref):
                count_m = count_m + 1
            else:
                count_mm = count_mm + 1
            OUT.write("\t".join(li_pos))
            # NOTE(review): counted once per written data row; confirm the
            # header line was not meant to be counted as well.
            count = count + 1
    return str(
        count_mm
    ) + " Reference mismatches: Reference allele is effect allele" + "\n" + str(
        count_m) + " Matches to reference genome" + "\n" + str(
            count) + " Sites checked"


def _tidy_quan_row(li_pos, stat_ref, gen_ref):
    """Rewrite one data row in place; return True if it matches the genome.

    On a mismatch the reference/alternative alleles are replaced, the sign
    of BETA is flipped and the N_ref/N_alt counts are swapped before EAC
    is computed. In both cases MAF, MAC, EAC, EAF and a terminating
    newline field are appended. Every stored value is a string so the row
    can be joined with tabs.
    """
    if stat_ref != gen_ref:
        li_pos[2] = gen_ref  # swap references
        li_pos[3] = stat_ref  # swap alternative
        # Stored as text: a bare float here would make the join fail.
        li_pos[BETA] = str(float(li_pos[BETA]) * -1)
        li_pos.append(MAF(li_pos))
        li_pos.append(MAC(li_pos))
        # Swap genotype counts; this must happen before EAC is computed.
        li_pos[N_alt], li_pos[N_ref] = li_pos[N_ref], li_pos[N_alt]
        li_pos.append(EAC(li_pos))
        li_pos.append(str(1 - float(li_pos[AF])))  # EAF
        li_pos.append("\n")
        return False
    li_pos.append(MAF(li_pos))
    li_pos.append(MAC(li_pos))
    li_pos.append(EAC(li_pos))
    li_pos.append(str(float(li_pos[AF])))  # EAF
    li_pos[BETA] = str(float(li_pos[BETA]))
    li_pos.append("\n")
    return True