import sys
from Queue import Empty  # a multiprocessing.Queue raises Queue.Empty when drained

# Repo-local helper modules (import paths may differ in the actual repo layout).
import indexing
import ioutils
import matstore


def main(proc_num, queue, out_dir, in_dir):
    """Worker: remap one year's co-occurrence counts onto the merged word index."""
    merged_index = ioutils.load_pickle(out_dir + "merged_index.pkl")
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Fixing counts for year", year
        fixed_counts = {}
        old_mat = matstore.retrieve_mat_as_dict(in_dir + str(year) + ".bin")
        old_index = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for pair, count in old_mat.iteritems():
            try:
                i_word = old_index[pair[0]]
            except IndexError:
                print pair
                sys.exit(0)
            c_word = old_index[pair[1]]
            # Translate both ids in the pair from the per-year index to the merged index.
            new_pair = (indexing.word_to_static_id(i_word, merged_index),
                        indexing.word_to_static_id(c_word, merged_index))
            fixed_counts[new_pair] = count
        print proc_num, "Writing counts for year", year
        matstore.export_mats_from_dicts({str(year): fixed_counts}, out_dir)
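# --- Hypothetical usage sketch, not part of the original script. It shows one way
# --- the count-fixing worker above could be driven: a multiprocessing.Queue is
# --- filled with years and several worker processes drain it, which matches the
# --- queue.get(block=False) / Queue.Empty handling in main(). The helper name
# --- run_parallel_fix, the process count, and the year list are assumptions.
import multiprocessing


def run_parallel_fix(num_procs, years, out_dir, in_dir):
    queue = multiprocessing.Queue()
    for year in years:
        queue.put(year)
    procs = [multiprocessing.Process(target=main, args=(i, queue, out_dir, in_dir))
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()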
import collections
import os
import re
import subprocess
import urllib2

import requests

# Repo-local helper modules; TYPE and VERSION are module-level constants
# (ngram type and dataset version) defined elsewhere in the original script.
import indexing
import ioutils
import matstore


def main(proc_num, lock, download_dir, source):
    """Worker: download, unzip, and aggregate one Google Books ngram shard at a time."""
    page = requests.get(
        "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.csv.zip)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    print proc_num, "Start loop"
    while True:
        # Claim an unprocessed shard under the lock so no two workers grab the same one.
        lock.acquire()
        work_left = False
        for url in urls:
            name = re.search('%s-(.*).csv.zip' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue
            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Downloading", name
        success = False
        while not success:
            # Write in binary mode; retry the download until it completes.
            with open(loc_dir + name + '.csv.zip', 'wb') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print proc_num, "Unzipping", name
        subprocess.call(['unzip', '-o', loc_dir + name + '.csv.zip', '-d', loc_dir])
        subprocess.call(['mv',
                         loc_dir + 'googlebooks-' + source + '-' + TYPE + '-'
                         + VERSION + '-' + name + '.csv',
                         loc_dir + name])

        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        n = 0
        with open(loc_dir + name) as f:
            for l in f:
                n += 1  # count lines processed (reported below)
                split = l.strip().split('\t')
                try:
                    # Pair the middle word of the ngram with every context word around it.
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    context = ngram[:middle_index] + ngram[middle_index + 1:]
                    item_id = indexing.word_to_id(item, index)
                    year = split[1]
                    count = int(split[2])
                    for context_word in context:
                        pair = (item_id, indexing.word_to_id(context_word, index))
                        year_counters[year][pair] += count
                except:
                    pass

        print proc_num, "Writing", name, n
        matstore.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")

        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name)
            os.remove(loc_dir + name + '.csv.zip')
        except:
            pass
import collections
import os
import re
import subprocess
import urllib2

# Repo-local helper modules; TYPE, VERSION, and EXCLUDE_PATTERN are
# module-level constants defined elsewhere in the original script.
import indexing
import ioutils
import matstore


def main(proc_num, lock, page, download_dir, source):
    """Worker: download, gunzip, and aggregate one ngram shard, skipping excluded ngrams."""
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    print proc_num, "Start loop"
    while True:
        # Claim an unprocessed shard under the lock so no two workers grab the same one.
        lock.acquire()
        work_left = False
        for url in urls:
            if EXCLUDE_PATTERN.match(url):
                continue
            name = re.search('%s-(.*).gz' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue
            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Downloading", name
        success = False
        while not success:
            # Write in binary mode; retry the download until it completes.
            with open(loc_dir + name + '.gz', 'wb') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print proc_num, "Unzipping", name
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz'])
        # Unlike the csv.zip variant, no rename is needed: the shard is saved as
        # loc_dir + name + '.gz', so gunzip already yields loc_dir + name.

        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        skipped = 0
        with open(loc_dir + name) as f:
            for l in f:
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                try:
                    # Pair the middle word of the ngram with every context word around it.
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    context = ngram[:middle_index] + ngram[middle_index + 1:]
                    item_id = indexing.word_to_id(item, index)
                    year = split[1]
                    count = int(split[2])
                    for context_word in context:
                        pair = (item_id, indexing.word_to_id(context_word, index))
                        year_counters[year][pair] += count
                except:
                    skipped += 1

        print proc_num, "Writing", name, "Skipped", skipped
        matstore.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")

        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name)
            os.remove(loc_dir + name + '.gz')
        except:
            pass
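# --- Hypothetical usage sketch, not part of the original script. It shows one way
# --- the download workers above could be launched: the dataset listing page is
# --- fetched once (the same URL as in the csv.zip variant is assumed here) and
# --- each worker claims shards under a shared multiprocessing.Lock, matching the
# --- lock.acquire()/release() protocol in main(). The helper name
# --- run_parallel_download and the process count are assumptions, and the fork
# --- start method is assumed so the Response object can be handed to children.
import multiprocessing

import requests


def run_parallel_download(num_procs, download_dir, source):
    page = requests.get(
        "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    lock = multiprocessing.Lock()
    procs = [multiprocessing.Process(target=main,
                                     args=(i, lock, page, download_dir, source))
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()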