import os
import cPickle
from collections import Counter, defaultdict, deque

from numpy import array
from scipy.sparse import csc_matrix

# The sentinel tokens _buffer_ / _rare_ and a few small helpers are defined
# elsewhere in this module; minimal sketches of the helpers follow below.


def rewrite_corpus(corpus, vocab, outfname):
    outfname += '.corpus'
    num_tok = 0
    with open(outfname, 'wb') as outf:
        with open(corpus) as corpusf:
            while True:
                lines = corpusf.readlines(10000000)  # read in ~10MB chunks
                if not lines:
                    break
                for line in lines:
                    toks = line.split()
                    for tok in toks:
                        num_tok += 1
                        # Keep in-vocabulary tokens; replace the rest with a
                        # placeholder.
                        if tok in vocab:
                            outf.write(tok + '\n')
                        else:
                            outf.write('<?>\n')
                        if num_tok % 1000 == 0:
                            inline_print('Processed %i tokens' % num_tok)
    inline_print('\n')
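
# The helpers and sentinels below are not part of this excerpt. These are
# minimal sketches inferred from the call sites, not the module's actual
# definitions: inline_print appears to rewrite the current terminal line,
# say prints a status line, wc_l counts lines, and complete_path normalizes
# a directory path. The sentinel values are hypothetical (rewrite_corpus
# writes '<?>' for out-of-vocabulary tokens, which suggests _rare_).
import sys

_buffer_ = '<!>'   # hypothetical boundary sentinel (actual value not shown)
_rare_ = '<?>'     # hypothetical rare-word sentinel (inferred, not confirmed)


def inline_print(string):
    # Hypothetical stand-in: overwrite the current line for progress updates.
    sys.stderr.write('\r' + string)
    sys.stderr.flush()


def say(string):
    # Hypothetical stand-in: one status message per line.
    print string


def wc_l(fname):
    # Hypothetical stand-in: number of lines in a file, like `wc -l`.
    with open(fname) as f:
        return sum(1 for _ in f)


def complete_path(path):
    # Hypothetical stand-in: ensure a trailing separator so that
    # path + filename concatenates correctly.
    return path if path.endswith('/') else path + '/'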
def count_ngrams(corpus, n_vals=None):
    assert os.path.isfile(corpus)
    if n_vals is None:
        answer = raw_input('Type in the values of n (e.g., "1 3"): ')
        n_vals = [int(n) for n in answer.split()]
    num_tok = 0
    ngrams = [Counter() for n in n_vals]
    # One sliding window per n, pre-filled with boundary sentinels.
    queues = [deque([_buffer_ for _ in range(n - 1)], n) for n in n_vals]
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000)  # read in ~10MB chunks
            if not lines:
                break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0:
                        inline_print('Processed %i tokens' % num_tok)
                    for i in range(len(n_vals)):
                        queues[i].append(tok)
                        ngrams[i][tuple(queues[i])] += 1
    # Flush each window with boundary sentinels so trailing n-grams count.
    for i in range(len(n_vals)):
        for _ in range(n_vals[i] - 1):
            queues[i].append(_buffer_)
            ngrams[i][tuple(queues[i])] += 1
    say('\nTotal {} tokens'.format(num_tok))
    files = [os.path.splitext(corpus)[0] + '.' + str(n) + 'grams'
             for n in n_vals]
    for i in range(len(n_vals)):
        say('Sorting {} {}grams and writing to: {}'.format(
            len(ngrams[i]), n_vals[i], files[i]))
        sorted_ngrams = sorted(ngrams[i].items(), key=lambda x: x[1],
                               reverse=True)
        with open(files[i], 'wb') as outf:
            for ngram, count in sorted_ngrams:
                for tok in ngram:
                    print >> outf, tok,
                print >> outf, count
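
# Output format of count_ngrams: for each n, a file <corpus-basename>.<n>grams
# with one n-gram per line, most frequent first, e.g. (illustrative counts):
#
#   the dog 42
#   dog barked 17
#
# count_ngrams is exercised in the hypothetical driver at the bottom of
# this file.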
def get_stat(self, stat):
    # Method of a class holding the count statistics (class definition not
    # shown in this excerpt). Loads X, Y, and XY count files produced by
    # extract_stat, caching the assembled matrices in a pickle file.
    self.stat = complete_path(stat)
    XYstats = self.stat + 'XY'
    Xstats = self.stat + 'X'
    Ystats = self.stat + 'Y'
    assert os.path.isfile(XYstats) and os.path.isfile(Xstats) and \
        os.path.isfile(Ystats)
    say('XYstats: {}'.format(XYstats))
    say('Xstats: {}'.format(Xstats))
    say('Ystats: {}'.format(Ystats))
    # Map 0-based indices back to word strings (files are 1-based for Matlab).
    self.wordmap = {}
    wordmapf = self.stat + 'wordmap'
    with open(wordmapf) as f:
        for line in f:
            toks = line.split()
            self.wordmap[int(toks[0]) - 1] = toks[1]
    # Reuse cached matrices if they exist (binary pickle, so open in 'rb').
    pickle_file = self.stat + 'pickle'
    if os.path.isfile(pickle_file):
        with open(pickle_file, 'rb') as f:
            self.countXY, self.countX, self.countY, self.num_samples = \
                cPickle.load(f)
        return
    self.countXY = Counter()
    self.countX = Counter()
    self.countY = Counter()
    self.num_samples = 0.
    num_lines = wc_l(XYstats)
    linenum = 0
    with open(XYstats) as f:
        for line in f:
            linenum += 1
            toks = line.split()
            x, y, count = int(toks[0]) - 1, int(toks[1]) - 1, int(toks[2])
            self.countXY[x, y] = count
            if linenum % 1000 == 0:
                inline_print('Processing line %i of %i' % (linenum, num_lines))
    with open(Xstats) as f:
        for line in f:
            toks = line.split()
            x, count = int(toks[0]) - 1, int(toks[1])
            self.countX[x] = count
            self.num_samples += count
    with open(Ystats) as f:
        for line in f:
            toks = line.split()
            y, count = int(toks[0]) - 1, int(toks[1])
            self.countY[y] = count
    inline_print('\nConstructing matrices\n')
    self.countXY = csc_matrix((self.countXY.values(),
                               zip(*self.countXY.keys())),
                              shape=(len(self.countX), len(self.countY)))
    self.countX = array([self.countX[i] for i in range(len(self.countX))])
    self.countY = array([self.countY[i] for i in range(len(self.countY))])
    # Cache the assembled matrices for fast reloading next time.
    with open(pickle_file, 'wb') as outf:
        cPickle.dump((self.countXY, self.countX, self.countY,
                      self.num_samples), outf,
                     protocol=cPickle.HIGHEST_PROTOCOL)
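
# The csc_matrix call above uses scipy's (data, (rows, cols)) constructor:
# a sequence of values paired with their coordinates. A tiny illustration of
# the same pattern (standalone sketch, not used by the pipeline):
def _demo_csc_construction():
    counts = Counter({(0, 0): 3, (1, 2): 5})   # (row, col) -> count
    data = counts.values()                     # values aligned with keys()
    rows_cols = zip(*counts.keys())            # [(rows...), (cols...)]
    M = csc_matrix((data, rows_cols), shape=(2, 3))
    print M.toarray()
    # [[3 0 0]
    #  [0 0 5]]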
def extract_stat(corpus, vocab, stat, window, hash_width=32):
    # Variant that hashes each positional context feature into a
    # hash_width-bit bucket to bound memory. It shares its name with the
    # unhashed variant below; in the original codebase the two presumably
    # live in different versions or files.
    stat += '.window' + str(window) + '.hashbits' + str(hash_width)
    assert os.path.isfile(corpus)
    XYcount = Counter()
    Xcount = Counter()
    Ycount = Counter()
    CollisionCount = defaultdict(set)

    def inc_stats(q):
        center = (window - 1) / 2  # position of the current token
        if q[center] == _buffer_:
            return
        token = q[center] if q[center] in vocab else _rare_
        Xcount[token] += 1
        friend = ''
        for i in range(window):
            if i != center:
                if q[i] != _buffer_:
                    friend += q[i] if q[i] in vocab else _rare_
                rel_pos = i - center
                pos_marker = ('<+' + str(rel_pos) + '>' if rel_pos > 0
                              else '<' + str(rel_pos) + '>')
                friend += pos_marker
        friend_hashv = fnv_hash(friend, hash_width)
        CollisionCount[friend_hashv].add(friend)
        XYcount[(token, friend_hashv)] += 1
        Ycount[friend_hashv] += 1

    num_tok = 0
    q = deque([_buffer_ for _ in range(window - 1)], window)
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000)  # read in ~10MB chunks
            if not lines:
                break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0:
                        inline_print('Processed %i tokens' % num_tok)
                    q.append(tok)
                    inc_stats(q)
    inline_print('\n')
    # Flush the window with boundary sentinels.
    for _ in range(window - 1):
        q.append(_buffer_)
        inc_stats(q)

    # Report how many distinct features landed in shared hash buckets.
    collisions = 0
    for key, value in CollisionCount.iteritems():
        if len(value) > 1:
            collisions += len(value)
    say('Collisions: {}'.format(collisions))

    say('Creating directory {}'.format(stat))
    if not os.path.exists(stat):
        os.makedirs(stat)
    xi, yi = {}, {}
    xhead, yhead = 1, 1  # starting from 1 for Matlab
    with open(stat + '/X', 'wb') as Xfile:
        for token in Xcount:
            if token not in xi:
                xi[token] = xhead
                xhead += 1
            print >> Xfile, xi[token], Xcount[token]
    with open(stat + '/wordmap', 'wb') as wordmapfile:
        for token in xi:
            print >> wordmapfile, xi[token], token
    with open(stat + '/Y', 'wb') as Yfile:
        for friend in Ycount:
            if friend not in yi:
                yi[friend] = yhead
                yhead += 1
            print >> Yfile, yi[friend], Ycount[friend]
    with open(stat + '/XY', 'wb') as XYfile:
        for (token, friend) in XYcount:
            print >> XYfile, xi[token], yi[friend], XYcount[(token, friend)]
    return XYcount, Xcount, Ycount, stat
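
# fnv_hash is not shown in this excerpt. A plausible implementation, assuming
# it is a standard 64-bit FNV-1a hash truncated to the low `width` bits (the
# module's actual function may differ):
def fnv_hash(string, width):
    # Hypothetical stand-in: 64-bit FNV-1a, truncated to `width` bits.
    h = 14695981039346656037                          # FNV-1a 64-bit offset basis
    for ch in string:
        h ^= ord(ch)
        h = (h * 1099511628211) & 0xffffffffffffffff  # FNV 64-bit prime, mod 2^64
    return h & ((1 << width) - 1)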
def extract_stat(corpus, vocab, stat, window):
    # Variant without feature hashing: each positional context string is
    # kept verbatim as a feature.
    stat += '.window' + str(window)
    assert os.path.isfile(corpus)
    XYcount = Counter()
    Xcount = Counter()
    Ycount = Counter()

    def inc_stats(q):
        center = (window - 1) / 2  # position of the current token
        if q[center] == _buffer_:
            return
        token = q[center] if q[center] in vocab else _rare_
        Xcount[token] += 1
        for i in range(window):
            if i != center:
                if q[i] == _buffer_:
                    continue
                friend = q[i] if q[i] in vocab else _rare_
                rel_position = i - center
                position_marker = ('<+' + str(rel_position) + '>'
                                   if rel_position > 0
                                   else '<' + str(rel_position) + '>')
                friend += position_marker
                XYcount[(token, friend)] += 1
                Ycount[friend] += 1

    num_tok = 0
    q = deque([_buffer_ for _ in range(window - 1)], window)
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000)  # read in ~10MB chunks
            if not lines:
                break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0:
                        inline_print('Processed %i tokens' % num_tok)
                    q.append(tok)
                    inc_stats(q)
    inline_print('\n')
    # Flush the window with boundary sentinels.
    for _ in range(window - 1):
        q.append(_buffer_)
        inc_stats(q)

    say('Creating directory {}'.format(stat))
    if not os.path.exists(stat):
        os.makedirs(stat)
    xi, yi = {}, {}
    xhead, yhead = 1, 1  # starting from 1 for Matlab
    with open(stat + '/X', 'wb') as Xfile:
        for token in Xcount:
            if token not in xi:
                xi[token] = xhead
                xhead += 1
            print >> Xfile, xi[token], Xcount[token]
    with open(stat + '/wordmap', 'wb') as wordmapfile:
        for token in xi:
            print >> wordmapfile, xi[token], token
    with open(stat + '/Y', 'wb') as Yfile:
        for friend in Ycount:
            if friend not in yi:
                yi[friend] = yhead
                yhead += 1
            print >> Yfile, yi[friend], Ycount[friend]
    with open(stat + '/XY', 'wb') as XYfile:
        for (token, friend) in XYcount:
            print >> XYfile, xi[token], yi[friend], XYcount[(token, friend)]
    return XYcount, Xcount, Ycount, stat
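
# A hypothetical end-to-end driver. The file names, vocabulary, and window
# size are illustrative, not part of the original module; it assumes a
# whitespace-tokenized text file at corpus.txt.
if __name__ == '__main__':
    vocab = set(['the', 'dog', 'barked'])
    rewrite_corpus('corpus.txt', vocab, 'filtered')    # -> filtered.corpus
    count_ngrams('corpus.txt', n_vals=[1, 2])          # -> corpus.{1,2}grams
    XY, X, Y, statdir = extract_stat('corpus.txt', vocab, 'stats', window=5)
    say('Statistics written under {}'.format(statdir))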