import sys
import pickle
from collections import defaultdict

# Local project modules (assumed): fasta provides FastaIterator, utils provides
# fileIterator. The unqualified helpers used below (fileIterator, Record,
# qualToInt, flatten, FastqParseError) are assumed to be defined elsewhere in
# this module.
import fasta
import utils


def FastqIterator(files, raw=False, titleSet=None):
    """Return an iterator of Records found in the given file(s).

    If Record objects are not needed, set raw=True to get plain
    (titleStr, seqStr, qualityStr) tuples instead. The sequence and
    quality strings are concatenated with newlines stripped.
    """
    def readTotitle(fh, titleChar):
        """Return a tuple ([lines before the next title line], next title line)."""
        preLines = []
        while True:
            l = fh.readline()
            if l.startswith(titleChar):
                return (preLines, l)
            elif l == '':
                return (preLines, None)
            else:
                preLines.append(l)

    for fh in fileIterator(files):
        preLines, nextTitleLine = readTotitle(fh, '@')
        while nextTitleLine is not None:
            seqTitle = nextTitleLine[1:].rstrip()
            preLines, nextTitleLine = readTotitle(fh, '+')
            if nextTitleLine is None:
                raise FastqParseError("Error in parsing: @title entry '%s' has no "
                                      "corresponding +title quality entry." % seqTitle)
            qualTitle = nextTitleLine[1:].rstrip()
            if len(qualTitle.strip()) > 0 and seqTitle != qualTitle:
                raise FastqParseError("Error in parsing: @title sequence entry must be "
                                      "immediately followed by corresponding +title "
                                      "quality entry.")
            seqLines = preLines
            qualLines = []
            # The quality block spans the same number of lines as the sequence block.
            for i in range(len(seqLines)):
                qualLines.append(fh.readline())
            preLines, nextTitleLine = readTotitle(fh, '@')

            # Skip records not in the requested title set, if one was given.
            if titleSet is not None and seqTitle not in titleSet:
                continue

            seqLines = map(lambda x: x.strip(), seqLines)
            qualLines = map(lambda x: x.strip(), qualLines)
            if raw:
                yield (seqTitle, ''.join(seqLines), ''.join(qualLines))
            else:
                rec = Record()
                rec.title = seqTitle
                rec.sequence = ''.join(seqLines)
                rec.quality = flatten(map(lambda x: qualToInt(x), qualLines))
                yield rec
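# Usage sketch (hypothetical): the path below is an assumption, and
# FastqIterator accepts whatever fileIterator accepts (a single filename here).
def _demoFastqIterator(path="reads.fastq"):  # hypothetical helper name
    # Default mode yields Record objects with integer quality scores.
    for rec in FastqIterator(path):
        print rec.title, len(rec.sequence)
    # raw=True yields plain (title, sequence, quality-string) tuples.
    for title, seq, qual in FastqIterator(path, raw=True):
        print title, len(seq), len(qual)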
def phdQualIterator(fastaFiles, qualFiles, raw=False):
    """Iterate paired FASTA sequence files and PHRED quality files, yielding
    Records (or raw (title, seq, qual) tuples if raw=True)."""
    def readTotitle(fh):
        """Return a tuple ([lines before the next title line], next title line)."""
        preLines = []
        while True:
            l = fh.readline()
            if l.startswith('>'):
                return (preLines, l)
            elif l == '':
                return (preLines, None)
            else:
                preLines.append(l)

    def qualityIterator(filename):
        fh = open(filename)
        preLines, nextTitleLine = readTotitle(fh)
        while nextTitleLine is not None:
            title = nextTitleLine[1:].rstrip()
            preLines, nextTitleLine = readTotitle(fh)
            yield (title, ' '.join(preLines))

    qualFiles = getIteratable(qualFiles)
    for idx, fastaFh in enumerate(fileIterator(fastaFiles)):
        # Quality files are paired with sequence files by position.
        qualIter = qualityIterator(qualFiles[idx])
        for seqTitle, sequence in fasta.FastaIterator(fastaFh, raw=True):
            qTitle, qualities = qualIter.next()
            sequence = sequence.replace('\n', '')
            qualities = qualities.replace('\n', '')
            if raw:
                yield (seqTitle, sequence, qualities)
            else:
                qualities = qualities.split()
                if len(sequence) != len(qualities):
                    raise Exception('Invalid number of qualities')
                rec = Record()
                rec.title = seqTitle
                rec.sequence = sequence
                rec.quality = qualities
                yield rec
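# Usage sketch (hypothetical): "reads.fasta"/"reads.qual" are assumed to be a
# matched FASTA/quality pair; qualFiles is indexed per sequence file, so pass
# one quality file per FASTA file, in the same order.
def _demoPhdQualIterator():  # hypothetical helper name
    for rec in phdQualIterator("reads.fasta", ["reads.qual"]):
        # rec.quality is a list of per-base quality values split on whitespace.
        print rec.title, len(rec.sequence), len(rec.quality)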
def makematrix(**kwargs):
    """Build a row-normalized word/context co-occurrence matrix from a sharded
    corpus and pickle both the matrix and the vocabulary mapping."""
    corpusfile = kwargs["corpusfile"]
    outfile = kwargs["outfile"]
    c_length = kwargs["c_length"]
    threshold = kwargs["threshold"]
    num_shards = kwargs["num_shards"]
    vocfile = kwargs["vocfile"]

    # Generate counts of each distinct word.
    print "Generating Wordcounts..."
    iterator = utils.fileIterator(corpusfile, num_shards)
    counts = defaultdict(int)
    num_sentences = 0
    for sentence in iterator:
        num_sentences += 1
        for word in sentence:
            counts[word] += 1.0
    print "Counts generated from %d sentences." % num_sentences

    print "Initializing Adj Matrix..."
    # Generate vocabulary mapping; disregard uncommon words.
    vocab = {word: idx for idx, word in
             enumerate([word for word, count in counts.iteritems()
                        if count > threshold])}
    vocab_size = len(vocab)

    # Initialize adjacency matrix: one sparse row of context counts per word.
    adj = [defaultdict(int) for _ in range(vocab_size)]
    print "Matrix initialized with vocab size of {0}.".format(vocab_size)

    # Generate contexts for every sentence in the corpus.
    iterator = utils.fileIterator(corpusfile, num_shards)
    print "Starting Matrix Computation..."
    progress_step = max(1, num_sentences / 100)
    sentence_count = 0
    for sentence in iterator:
        sentence_count += 1
        if sentence_count % progress_step == 0:
            sys.stderr.write("Progress: {0}%\r".format((100 * sentence_count) / num_sentences))
        # Generate contexts for each word in the sentence.
        for pos in range(0, len(sentence)):
            word = sentence[pos]
            # Disregard uncommon words.
            if word not in vocab:
                continue
            word = vocab[word]
            # For each offset in the +/- c_length context window.
            for offset in range(-c_length, c_length + 1):
                # Skip the word itself and out-of-range positions.
                if offset == 0 or pos + offset < 0 or pos + offset >= len(sentence):
                    continue
                context = sentence[pos + offset]
                # Disregard uncommon contexts.
                if context not in vocab:
                    continue
                context = vocab[context]
                adj[word][context] += 1.0

    # Normalize context counts so each row sums to 1 (skip words that were
    # never observed in any context, to avoid dividing by zero).
    for word in range(0, vocab_size):
        total_counts = sum(adj[word].values())
        if total_counts > 0:
            adj[word] = {context: count / total_counts
                         for context, count in adj[word].iteritems()}

    pickle.dump(adj, open(outfile, "w"))
    pickle.dump(vocab, open(vocfile, "w"))
    return
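# Usage sketch (all paths and parameters hypothetical): build a matrix from a
# corpus sharded across 4 files, keeping words seen more than 5 times and
# counting contexts in a +/-2 word window.
def _demoMakematrix():  # hypothetical helper name
    makematrix(corpusfile="corpus/shard", outfile="adj.pkl",
               vocfile="vocab.pkl", c_length=2, threshold=5, num_shards=4)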