def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--total-jobs', metavar='<total-jobs>',
                        help='total number of jobs downloading documents',
                        type=int)
    parser.add_argument('--job', metavar='<job>',
                        help='job number between 1 and <total-jobs>',
                        type=int)

    args = parser.parse_args()
    check_args(parser, args)

    br = Browser()
    br.set_handle_robots(False)
    # br.set_debug_responses(True)

    data = urlencode({'user': USERNAME, 'pass': getpass()})

    document_urls = [LOGIN_PREFIX + url.strip() + '&view=etext'
                     for url in file(DOCUMENT_URLS_FILE)]

    start = args.job - 1
    step = args.total_jobs

    for url in iterview(document_urls[start::step]):
        try:
            get_document_pages(br, url, data)
        except Exception as e:
            print >> sys.stderr, '\n', (url, e)
def check(func, dist):
    """
    Arguments:

    func -- function to check
    dist -- (unnormalized) distribution to pass to func
    """

    num_samples = 100000

    empirical = zeros(len(dist))

    for n in iterview(xrange(num_samples)):
        empirical[func(dist)] += 1

    empirical /= num_samples

    normalized_dist = dist / float(dist.sum())

    # could look at max relative error
    # could also look at JS or KL divergence
    # could do absolute difference
    # ...

    error = (abs(empirical - normalized_dist) / normalized_dist).mean()

    assert error < 0.01, 'Mean relative error >= 1%'
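A minimal way to exercise check, assuming numpy is imported as in the snippet above; naive_sample here is a hypothetical sampler built on numpy.random.multinomial, and the distribution values are arbitrary:

# Hypothetical sampler to test with check(): normalizes the
# distribution, draws one multinomial sample, and returns its index.
from numpy import array
from numpy.random import multinomial

def naive_sample(dist):
    return multinomial(1, dist / float(dist.sum())).argmax()

check(naive_sample, array([1.0, 5.0, 3.0, 0.5]))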
def log_predictive_prob(self, new_corpus, num_samples):

    D, V, T = self.D, self.V, self.T

    Nvt_plus_beta_n = self.Nvt_plus_beta_n
    Nt_plus_beta = self.Nt_plus_beta
    Dt_plus_alpha_m = self.Dt_plus_alpha_m
    D_plus_alpha = self.D_plus_alpha

    Nvt_new, Nt_new, Dt_new, z_new = [], [], [], []

    for r in xrange(num_samples):
        Nvt_new.append(zeros((T, V), dtype=int))
        Nt_new.append(zeros(T, dtype=int))
        Dt_new.append(zeros(T, dtype=int))
        z_new.append(zeros(len(new_corpus), dtype=int))

    log_p = 0

    for d, doc in enumerate(iterview(new_corpus)):

        tmp = zeros(num_samples, dtype=float)

        for r in xrange(num_samples):

            # resample component assignments for all previous
            # documents in the new corpus
            for prev_d in xrange(0, d):

                prev_doc = new_corpus.documents[prev_d]

                t = z_new[r][prev_d]

                Nvt_new[r][t, :] -= prev_doc.Nv
                Nt_new[r][t] -= len(prev_doc)
                Dt_new[r][t] -= 1

                t = log_sample(gammaln(Nt_new[r] + Nt_plus_beta) -
                               gammaln(Nvt_new[r] +
                                       Nvt_plus_beta_n).sum(axis=1) +
                               gammaln(tile(prev_doc.Nv, (T, 1)) +
                                       Nvt_new[r] +
                                       Nvt_plus_beta_n).sum(axis=1) -
                               gammaln(len(prev_doc) * ones(T) +
                                       Nt_new[r] + Nt_plus_beta) +
                               log(Dt_new[r] + Dt_plus_alpha_m))

                Nvt_new[r][t, :] += prev_doc.Nv
                Nt_new[r][t] += len(prev_doc)
                Dt_new[r][t] += 1

                z_new[r][prev_d] = t

            log_dist = (gammaln(Nt_new[r] + Nt_plus_beta) -
                        gammaln(Nvt_new[r] +
                                Nvt_plus_beta_n).sum(axis=1) +
                        gammaln(tile(doc.Nv, (T, 1)) + Nvt_new[r] +
                                Nvt_plus_beta_n).sum(axis=1) -
                        gammaln(len(doc) * ones(T) + Nt_new[r] +
                                Nt_plus_beta) +
                        log(Dt_new[r] + Dt_plus_alpha_m) -
                        log(d + D_plus_alpha))

            tmp[r] = log_sum_exp(log_dist)

            t = log_sample(log_dist)

            Nvt_new[r][t, :] += doc.Nv
            Nt_new[r][t] += len(doc)
            Dt_new[r][t] += 1

            z_new[r][d] = t

        log_p += log_sum_exp(tmp) - log(num_samples)

    return log_p
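These snippets call log_sample and log_sum_exp helpers defined elsewhere in the codebase; a minimal sketch consistent with how they are used (drawing an index from an unnormalized log-probability vector) might be:

# Sketch of the log-domain helpers assumed above; the real
# implementations live elsewhere in this codebase.
from numpy import exp, log
from numpy.random import uniform

def log_sum_exp(x):
    # compute log(sum(exp(x))) stably by shifting by the max
    m = x.max()
    return m + log(exp(x - m).sum())

def log_sample(log_dist):
    # normalize in the log domain, then draw via the inverse CDF
    dist = exp(log_dist - log_sum_exp(log_dist))
    return dist.cumsum().searchsorted(uniform())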
def time_taken(func, corpus, alpha, m, beta, n, num_reps):

    avg = 0

    for rep in iterview(xrange(num_reps), inc=1):
        start = time.time()
        func(corpus, alpha, m, beta, n)
        avg += (time.time() - start)

    avg /= float(num_reps)

    return avg
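A usage sketch (all names hypothetical): run_gibbs stands in for any function with the (corpus, alpha, m, beta, n) signature timed above, and train_corpus, m, and n are placeholders:

# Hypothetical usage: average seconds per call over five repetitions.
avg_secs = time_taken(run_gibbs, train_corpus, 0.1, m, 0.01, n, 5)
print 'average time per rep: %fs' % avg_secs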
def gibbs_iteration(self, init=False):
    """
    Uses Gibbs sampling to draw a single sample from the posterior
    distribution over document--component assignments (i.e.,
    document groups) given this instance's corpus (i.e., document
    tokens). By default (i.e., if keyword argument 'init' is set to
    the value 'False') all document--component assignments (and
    corresponding counts) are assumed to have been initialized
    previously; otherwise, they are initialized.

    Keyword arguments:

    init -- whether to initialize document--component assignments
    """

    corpus = self.corpus

    T = self.T

    alpha_m = self.alpha_m

    Nvt_plus_beta_n = self.Nvt_plus_beta_n
    Nt_plus_beta = self.Nt_plus_beta
    Dt = self.Dt

    z = self.z

    for d, (doc, t) in enumerate(iterview(zip(corpus, z))):

        if not init:
            Nvt_plus_beta_n[t, :] -= doc.Nv
            Nt_plus_beta[t] -= len(doc)
            Dt[t] -= 1

        t = log_sample(gammaln(Nt_plus_beta) -
                       gammaln(Nvt_plus_beta_n).sum(axis=1) +
                       gammaln(tile(doc.Nv, (T, 1)) +
                               Nvt_plus_beta_n).sum(axis=1) -
                       gammaln(len(doc) * ones(T) + Nt_plus_beta) +
                       log(Dt + alpha_m))

        Nvt_plus_beta_n[t, :] += doc.Nv
        Nt_plus_beta[t] += len(doc)
        Dt[t] += 1

        z[d] = t
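In math form, the log-domain expression above implements (up to an additive constant) the collapsed Gibbs conditional of a Dirichlet--multinomial mixture; using the notation suggested by the count variables, with $N_{v|d}$ the count of word type $v$ in document $d$, $N_d$ the document length, and all other counts excluding document $d$:

$$P(z_d = t \mid \boldsymbol{z}_{\setminus d}, \mathcal{D}) \;\propto\; (D_t + \alpha m_t)\,\frac{\Gamma(N_t + \beta)}{\Gamma(N_t + N_d + \beta)}\,\prod_v \frac{\Gamma(N_{v|t} + N_{v|d} + \beta n_v)}{\Gamma(N_{v|t} + \beta n_v)}$$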
def main():

    br = Browser()
    br.set_handle_robots(False)
    # br.set_debug_responses(True)

    data = urlencode({'user': USERNAME, 'pass': getpass()})

    classifications = get_metadata_options(br, 'ca', data)
    sources = get_metadata_options(br, 'is', data)

    makedir(METADATA_DIR)
    makedir(TEXT_DIR)

    for filename in iterview(glob(DOCUMENT_PAGES_DIR + '/*'), inc=1000):
        extract_data(filename, classifications, sources)
def gibbs_iteration(self, init=False):
    """
    Uses Gibbs sampling to draw a single sample from the posterior
    distribution over document--component assignments (i.e.,
    document groups) given this instance's corpus (i.e., document
    tokens). By default (i.e., if keyword argument 'init' is set to
    the value 'False') all document--component assignments (and
    corresponding counts) are assumed to have been initialized
    previously; otherwise, they are initialized.

    Keyword arguments:

    init -- whether to initialize document--component assignments
    """

    corpus = self.corpus

    T = self.T

    Nvt_plus_beta_n = self.Nvt_plus_beta_n
    Nt_plus_beta = self.Nt_plus_beta
    Dt_plus_alpha_m = self.Dt_plus_alpha_m

    z = self.z

    for d, (doc, t) in enumerate(iterview(zip(corpus, z))):

        if not init:
            Nvt_plus_beta_n[t, :] -= doc.Nv
            Nt_plus_beta[t] -= len(doc)
            Dt_plus_alpha_m[t] -= 1

        t = log_sample(gammaln(Nt_plus_beta) -
                       gammaln(Nvt_plus_beta_n).sum(axis=1) +
                       gammaln(tile(doc.Nv, (T, 1)) +
                               Nvt_plus_beta_n).sum(axis=1) -
                       gammaln(len(doc) * ones(T) + Nt_plus_beta) +
                       log(Dt_plus_alpha_m))

        Nvt_plus_beta_n[t, :] += doc.Nv
        Nt_plus_beta[t] += len(doc)
        Dt_plus_alpha_m[t] += 1

        z[d] = t
def list_all_duplicates(directory):
    target = Path(directory)

    print('[*] finding all files...')
    files = sorted(target.glob('**/*'), key=lambda x: x.stat().st_size)
    files = [f for f in files if f.stat().st_size > MIN_SIZE]

    hashes = {}
    first = None
    cur_size = -1

    for path in iterview(files):
        if path.is_dir():
            continue

        size = path.stat().st_size
        if cur_size == size:
            # two files of the same size were found: hash and record
            # the pending first file before hashing this one
            if first is not None:
                with first.open('rb') as f:
                    hashval = md5(f.read()).hexdigest()
                hashes[hashval] = [first]
                first = None

            with path.open('rb') as f:
                hashval = md5(f.read()).hexdigest()
            if hashes.get(hashval):
                hashes[hashval].append(path)
            else:
                hashes[hashval] = [path]
        else:
            first = path
            cur_size = size

    dup_files_list = []
    for paths in hashes.values():
        if len(paths) > 1:
            dup_files_list.append(paths)

    return dup_files_list
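A usage sketch, printing each group of byte-identical files found under the current directory (assumes MIN_SIZE and iterview are in scope as above):

# Print every set of duplicate files found below '.'.
for group in list_all_duplicates('.'):
    print('[+] duplicate set:')
    for path in group:
        print('    %s' % path)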
def gibbs_iteration(self, init=False):
    """
    Uses Gibbs sampling to draw a single sample from the posterior
    distribution over token--component (i.e., token--topic)
    assignments given this instance's corpus (i.e., document
    tokens). By default (i.e., if keyword argument 'init' is set to
    the value 'False') all token--component assignments (and
    corresponding counts) are assumed to have been initialized
    previously; otherwise, they are initialized.

    Keyword arguments:

    init -- whether to initialize token--component assignments
    """

    corpus = self.corpus

    Nvt_plus_beta_n = self.Nvt_plus_beta_n
    Nt_plus_beta = self.Nt_plus_beta
    Ntd_plus_alpha_m = self.Ntd_plus_alpha_m
    Nd_plus_alpha = self.Nd_plus_alpha

    z = self.z

    for d, (doc, zd) in enumerate(iterview(zip(corpus, z), inc=200)):
        for n, (v, t) in enumerate(zip(doc.w, zd)):

            if not init:
                Nvt_plus_beta_n[v, t] -= 1
                Nt_plus_beta[t] -= 1
                Ntd_plus_alpha_m[d, t] -= 1

            t = sample((Nvt_plus_beta_n[v, :] / Nt_plus_beta) *
                       Ntd_plus_alpha_m[d, :])

            Nvt_plus_beta_n[v, t] += 1
            Nt_plus_beta[t] += 1
            Ntd_plus_alpha_m[d, t] += 1

            if init:
                Nd_plus_alpha[d] += 1

            zd[n] = t
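The sample helper used here is also defined elsewhere; a minimal sketch consistent with its use (drawing an index from an unnormalized, non-log probability vector) might be:

# Sketch of the sample() helper assumed above: inverse-CDF draw from
# an unnormalized distribution.
from numpy.random import uniform

def sample(dist):
    return dist.cumsum().searchsorted(uniform() * dist.sum())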
def get_listing_pages(br):
    """
    Caches the contents of each URL in the file whose name is stored
    in the variable LISTING_URLS_FILE to the directory whose name is
    stored in the variable LISTING_PAGES_DIR. The contents of each
    URL are stored in a file whose name is that URL's md5 hash.

    Arguments:

    br -- Browser object
    """

    listing_urls = [url.strip() for url in file(LISTING_URLS_FILE)]

    for url in iterview(listing_urls):
        try:
            download_url(br, url, LISTING_PAGES_DIR)
        except Exception as e:
            print >> sys.stderr, '\n', (url, e)
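A sketch of a download_url consistent with the docstring above (the real implementation lives elsewhere), caching each response body under the md5 hash of its URL:

# Sketch only: fetch a URL with the mechanize Browser and cache the
# response body in 'directory' under the URL's md5 hash.
from hashlib import md5
import os

def download_url(br, url, directory):
    contents = br.open(url).read()
    with open(os.path.join(directory, md5(url).hexdigest()), 'wb') as f:
        f.write(contents)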
def group_hashes(hashes):
    debug('Start grouping hashes...')

    g = groups.Groups(hashes.keys())
    for k1, v1 in iterview(hashes.items()):
        img_id = g.find(str(k1))
        for k2, v2 in hashes.items():
            if k1 == k2:
                continue
            if v1 - v2 <= HASH_THRESHOLD:
                g.unite(img_id, str(k2))

    # filter out groups that contain only one element
    group_result = g.get()
    cnt = Counter(group_result.values())
    filtered = [item for item in group_result.items() if cnt[item[1]] > 1]

    return filtered
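A usage sketch (hypothetical): the hashes mapping can be built with the imagehash library, whose hash objects support subtraction as a Hamming distance, which is what the v1 - v2 comparison above relies on; image_paths is a placeholder:

# Hypothetical usage: map each image path to a perceptual hash, then
# group near-duplicates.
import imagehash
from PIL import Image

hashes = {path: imagehash.average_hash(Image.open(path))
          for path in image_paths}
near_duplicates = group_hashes(hashes)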
def gibbs_iteration(self, init=False):
    """
    Uses Gibbs sampling to draw a single sample from the posterior
    distribution over token--component (i.e., token--topic)
    assignments given this instance's corpus (i.e., document
    tokens). By default (i.e., if keyword argument 'init' is set to
    the value 'False') all token--component assignments (and
    corresponding counts) are assumed to have been initialized
    previously; otherwise, they are initialized.

    Keyword arguments:

    init -- whether to initialize token--component assignments
    """

    corpus = self.corpus

    Nvt_plus_beta_n = self.Nvt_plus_beta_n
    Nt_plus_beta = self.Nt_plus_beta
    Ntd_plus_alpha_m = self.Ntd_plus_alpha_m
    Nd_plus_alpha = self.Nd_plus_alpha

    z = self.z

    for d, (doc, zd) in enumerate(iterview(zip(corpus, z), inc=200)):
        for n, (v, t) in enumerate(zip(doc.w, zd)):

            if not init:
                Nvt_plus_beta_n[v, t] -= 1
                Nt_plus_beta[t] -= 1
                Ntd_plus_alpha_m[d, t] -= 1

            pass # YOUR CODE GOES HERE

            Nvt_plus_beta_n[v, t] += 1
            Nt_plus_beta[t] += 1
            Ntd_plus_alpha_m[d, t] += 1

            if init:
                Nd_plus_alpha[d] += 1

            zd[n] = t
def gibbs_iteration(self, init=False):
    """
    Uses Gibbs sampling to draw a single sample from the posterior
    distribution over document--component assignments (i.e.,
    document groups) given this instance's corpus (i.e., document
    tokens). By default (i.e., if keyword argument 'init' is set to
    the value 'False') all document--component assignments (and
    corresponding counts) are assumed to have been initialized
    previously; otherwise, they are initialized.

    Keyword arguments:

    init -- whether to initialize document--component assignments
    """

    corpus = self.corpus

    T = self.T

    alpha_m = self.alpha_m

    Nvt_plus_beta_n = self.Nvt_plus_beta_n
    Nt_plus_beta = self.Nt_plus_beta
    Dt = self.Dt

    z = self.z

    for d, (doc, t) in enumerate(iterview(zip(corpus, z))):

        if not init:
            Nvt_plus_beta_n[t, :] -= doc.Nv
            Nt_plus_beta[t] -= len(doc)
            Dt[t] -= 1

        pass # YOUR CODE GOES HERE

        Nvt_plus_beta_n[t, :] += doc.Nv
        Nt_plus_beta[t] += len(doc)
        Dt[t] += 1

        z[d] = t
def get_listing_data():

    with safe_write(CSV_FILE) as f:
        for filename in iterview(glob(LISTING_PAGES_DIR + '/*')):

            contents = file(filename).read()
            # print contents

            try:
                [obj] = re.findall(r'dataLayer\s*=\s*\[(.*)\];', contents)
                obj = loads(obj)
            except ValueError:
                continue  # skip pages without exactly one dataLayer object

            if 'listPrice' in obj and 'listBed' in obj:
                text = '\t'.join((os.path.basename(filename),
                                  str(obj['listPrice']),
                                  str(obj['listBed'])))
                f.write(text)
                f.write('\n')
                f.flush()
def create_csv(numbers, max_document_length, min_type_count, stopwords):

    vocab = defaultdict(int)

    data = {}

    for filename in iterview(glob(METADATA_DIR + '/*')):

        number = os.path.basename(filename)

        if numbers is not None and number not in numbers:
            continue

        with file(filename) as f:
            metadata = f.read().strip()

        fields = metadata.split('\t')
        assert len(fields) == 8
        assert fields[0] == number

        text = ''

        # the final metadata field gives the page count
        for page in xrange(1, int(fields[-1]) + 1):
            with file(os.path.join(TEXT_DIR, number + '_' + str(page))) as f:
                text += f.read().strip()

        text = re.findall('[a-z]+', text)
        text = [x for x in text if x not in stopwords]

        for x in text:
            vocab[x] += 1

        data[number] = (metadata, ' '.join(text))

    for number, (metadata, text) in data.items():
        text = [x for x in text.split(' ') if vocab[x] >= min_type_count]
        text = text[:min(len(text), max_document_length)]
        print '\t'.join([metadata, ' '.join(text)])
def time_taken(func, dists, num_reps, num_samples=1):

    seed(1000)

    mean = 0

    for rep in iterview(xrange(num_reps)):
        start = time.time()
        for dist in dists:
            if num_samples == 1:
                func(dist)
            else:
                func(dist, num_samples)
        mean += (time.time() - start) / float(len(dists))

    mean /= float(num_reps)

    return mean
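A usage sketch (hypothetical): benchmark a sampler over random unnormalized distributions; sample is the helper sketched earlier, and the sizes are arbitrary:

# Hypothetical usage: average seconds per distribution over ten reps.
from numpy.random import random

dists = [random(10) for _ in xrange(1000)]
print 'sample: %fs' % time_taken(sample, dists, 10)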
def gibbs_iteration(self, init=False):
    """
    Uses Gibbs sampling to draw a single sample from the posterior
    distribution over token--topic assignments.

    Keyword arguments:

    init -- whether to initialize token--topic assignments
    """

    corpus = self.corpus

    Ntd_plus_alpha_m = self.Ntd_plus_alpha_m
    Nd_plus_alpha = self.Nd_plus_alpha
    Nvt_plus_beta_n = self.Nvt_plus_beta_n
    Nt_plus_beta = self.Nt_plus_beta

    z = self.z

    for d, (doc, zd) in enumerate(iterview(zip(corpus, z), inc=200)):
        for n, (v, t) in enumerate(zip(doc.tokens, zd)):

            if not init:
                Ntd_plus_alpha_m[d, t] -= 1
                Nvt_plus_beta_n[v, t] -= 1
                Nt_plus_beta[t] -= 1
            else:
                Nd_plus_alpha[d] += 1

            t = sample((Nvt_plus_beta_n[v, :] / Nt_plus_beta) *
                       Ntd_plus_alpha_m[d, :])

            Ntd_plus_alpha_m[d, t] += 1
            Nvt_plus_beta_n[v, t] += 1
            Nt_plus_beta[t] += 1

            zd[n] = t
def log_predictive_prob(self, new_corpus, num_samples):

    D, V, T = self.D, self.V, self.T

    Nvt_plus_beta_n = self.Nvt_plus_beta_n
    Nt_plus_beta = self.Nt_plus_beta
    Dt_plus_alpha_m = self.Dt_plus_alpha_m
    D_plus_alpha = self.D_plus_alpha

    Nvt_new, Nt_new, Dt_new, z_new = [], [], [], []

    for r in xrange(num_samples):
        Nvt_new.append(zeros((T, V), dtype=int))
        Nt_new.append(zeros(T, dtype=int))
        Dt_new.append(zeros(T, dtype=int))
        z_new.append(zeros(len(new_corpus), dtype=int))

    log_p = 0

    for d, doc in enumerate(iterview(new_corpus)):

        tmp = zeros(num_samples, dtype=float)

        for r in xrange(num_samples):

            # resample component assignments for all previous
            # documents in the new corpus
            for prev_d in xrange(0, d):

                prev_doc = new_corpus.documents[prev_d]

                t = z_new[r][prev_d]

                Nvt_new[r][t, :] -= prev_doc.Nv
                Nt_new[r][t] -= len(prev_doc)
                Dt_new[r][t] -= 1

                t = log_sample(gammaln(Nt_new[r] + Nt_plus_beta) -
                               gammaln(Nvt_new[r] +
                                       Nvt_plus_beta_n).sum(axis=1) +
                               gammaln(tile(prev_doc.Nv, (T, 1)) +
                                       Nvt_new[r] +
                                       Nvt_plus_beta_n).sum(axis=1) -
                               gammaln(len(prev_doc) * ones(T) +
                                       Nt_new[r] + Nt_plus_beta) +
                               log(Dt_new[r] + Dt_plus_alpha_m))

                Nvt_new[r][t, :] += prev_doc.Nv
                Nt_new[r][t] += len(prev_doc)
                Dt_new[r][t] += 1

                z_new[r][prev_d] = t

            pass # YOUR CODE GOES HERE

            Nvt_new[r][t, :] += doc.Nv
            Nt_new[r][t] += len(doc)
            Dt_new[r][t] += 1

            z_new[r][d] = t

        log_p += log_sum_exp(tmp) - log(num_samples)

    return log_p
from glob import glob
import os

from iterview import iterview

for filename in iterview(glob('data/cache/html/*/*'), inc=1000):
    try:
        with file(filename) as f:
            contents = f.read()
        assert contents
        assert '<title>Off-Campus' not in contents
    except AssertionError:
        print 'Removing ', filename
        os.remove(filename)
def log_predictive_prob(self, new_corpus, num_samples):
    """
    Returns an approximation of the log probability of the specified
    new corpus given this instance's corpus (i.e., document tokens)
    AND current set of token--component (i.e., token--topic)
    assignments according to LDA.

    Arguments:

    new_corpus -- new corpus of documents
    num_samples -- ...
    """

    V, T = self.V, self.T

    D_new = len(new_corpus)

    alpha, alpha_m = self.alpha, self.alpha_m

    Nvt_plus_beta_n = self.Nvt_plus_beta_n
    Nt_plus_beta = self.Nt_plus_beta

    Nvt_new, Nt_new, Ntd_new, z_new = [], [], [], []

    for r in xrange(num_samples):
        Nvt_new.append(zeros((V, T), dtype=int))
        Nt_new.append(zeros(T, dtype=int))
        Ntd_new.append(zeros((D_new, T), dtype=int))
        z_r = []
        for doc in new_corpus:
            z_r.append(zeros(len(doc), dtype=int))
        z_new.append(z_r)

    log_p = 0

    for d, doc in enumerate(iterview(new_corpus)):
        for n, v in enumerate(doc.w):

            tmp = zeros(num_samples, dtype=float)

            for r in xrange(num_samples):

                # for efficiency, resample only those
                # token--component assignments belonging to previous
                # tokens in the current document
                for prev_n in xrange(0, n):

                    prev_v = doc.w[prev_n]

                    t = z_new[r][d][prev_n]

                    Nvt_new[r][prev_v, t] -= 1
                    Nt_new[r][t] -= 1
                    Ntd_new[r][d, t] -= 1

                    t = sample((Nvt_new[r][prev_v, :] +
                                Nvt_plus_beta_n[prev_v, :]) /
                               (Nt_new[r] + Nt_plus_beta) *
                               (Ntd_new[r][d, :] + alpha_m))

                    Nvt_new[r][prev_v, t] += 1
                    Nt_new[r][t] += 1
                    Ntd_new[r][d, t] += 1

                    z_new[r][d][prev_n] = t

                dist = ((Nvt_new[r][v, :] + Nvt_plus_beta_n[v, :]) /
                        (Nt_new[r] + Nt_plus_beta)) * \
                       ((Ntd_new[r][d, :] + alpha_m) / (n + alpha))

                tmp[r] = log(dist.sum())

                t = sample(dist)

                Nvt_new[r][v, t] += 1
                Nt_new[r][t] += 1
                Ntd_new[r][d, t] += 1

                z_new[r][d][n] = t

            log_p += log_sum_exp(tmp) - log(num_samples)

    return log_p
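A usage sketch (hypothetical names): model is a fitted LDA instance and test_corpus a held-out corpus; ten samples are averaged:

# Hypothetical usage: approximate held-out log probability.
print model.log_predictive_prob(test_corpus, 10)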
def log_predictive_prob(self, new_corpus, num_samples):
    """
    Returns an approximation of the log probability of the specified
    new corpus given this instance's corpus (i.e., document tokens)
    AND current set of token--component (i.e., token--topic)
    assignments according to LDA.

    Arguments:

    new_corpus -- new corpus of documents
    num_samples -- ...
    """

    V, T = self.V, self.T

    D_new = len(new_corpus)

    alpha, alpha_m = self.alpha, self.alpha_m

    Nvt_plus_beta_n = self.Nvt_plus_beta_n
    Nt_plus_beta = self.Nt_plus_beta

    Nvt_new, Nt_new, Ntd_new, z_new = [], [], [], []

    for r in xrange(num_samples):
        Nvt_new.append(zeros((V, T), dtype=int))
        Nt_new.append(zeros(T, dtype=int))
        Ntd_new.append(zeros((D_new, T), dtype=int))
        z_r = []
        for doc in new_corpus:
            z_r.append(zeros(len(doc), dtype=int))
        z_new.append(z_r)

    log_p = 0

    for d, doc in enumerate(iterview(new_corpus)):
        for n, v in enumerate(doc.w):

            tmp = zeros(num_samples, dtype=float)

            for r in xrange(num_samples):

                # for efficiency, resample only those
                # token--component assignments belonging to previous
                # tokens in the current document
                for prev_n in xrange(0, n):

                    prev_v = doc.w[prev_n]

                    t = z_new[r][d][prev_n]

                    Nvt_new[r][prev_v, t] -= 1
                    Nt_new[r][t] -= 1
                    Ntd_new[r][d, t] -= 1

                    t = sample((Nvt_new[r][prev_v, :] +
                                Nvt_plus_beta_n[prev_v, :]) /
                               (Nt_new[r] + Nt_plus_beta) *
                               (Ntd_new[r][d, :] + alpha_m))

                    Nvt_new[r][prev_v, t] += 1
                    Nt_new[r][t] += 1
                    Ntd_new[r][d, t] += 1

                    z_new[r][d][prev_n] = t

                pass # YOUR CODE GOES HERE

                Nvt_new[r][v, t] += 1
                Nt_new[r][t] += 1
                Ntd_new[r][d, t] += 1

                z_new[r][d][n] = t

            log_p += log_sum_exp(tmp) - log(num_samples)

    return log_p