def load_unigrams(my_index):
    punct = punctuation + '«»—…“”*–'
    russian = "[А-Яа-я]+"
    tags = ("_ADJ", "_ADP", "_ADV", "_CONJ", "_NOUN", "_NUM", "_PRT", "_VERB", "_X")
    fname, url, records = next(
        readline_google_store(ngram_len=1, lang='rus', indices=my_index))
    record = next(records)
    count = 0
    with open('unigrams_' + my_index + '.tsv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        while True:
            try:
                if record.year < 1918:
                    record = next(records)
                else:
                    if len(record.ngram.strip(punct)) > 2 \
                            and re.search(russian, record.ngram) \
                            and not record.ngram.endswith(tags):
                        writer.writerow([
                            record.ngram, record.year, record.match_count,
                            record.volume_count
                        ])
                        count += 1
                        record = next(records)
                    else:
                        record = next(records)
            except StopIteration:
                break
    print(str(count) + " " + my_index + " ngrams saved")
    return 0
def getNgrams(self):
    """Get Frequency of Words in Google Ngram Corpus."""
    keys = self.Ngrams.keys()
    alphabet = list(map(chr, range(97, 123)))
    count = 0
    current = ''
    for char in alphabet:
        googleGen = readline_google_store(ngram_len=1, indices=char)
        while googleGen:
            try:
                name, url, wordGen = next(googleGen)
                while wordGen:
                    try:
                        token, year, match, volume = next(wordGen)
                        if token in keys:
                            if token == current:
                                count += match
                            else:
                                self.Ngrams[current] = count
                                current = token
                                count = match  # start the new token's tally with this record
                        else:
                            continue
                    except StopIteration:
                        break
            except StopIteration:
                break
        print("Finished with" + "\t" + char + "\n")
    print("Ngram Counts Completed!")
def find_google_ngrams_word_count(word, time_function=False, verbose=False):
    if time_function:
        time1 = time.time()
    count = 2  # Set this to a minimum of 2 so we don't get a divide by zero error
    # TODO: Consider how we want to deal with capitalization
    fname, url, records = next(readline_google_store(ngram_len=1, indices=word[0]))
    # If we use the verbose setting, occasionally print out the record
    verbosity_count = 1000000000
    earliest_year = 1950
    i = 0
    try:
        record = next(records)
        while record.ngram != word:
            record = next(records)
            if verbose and i % verbosity_count == 0:
                print(record)
            i += 1
        while record.ngram == word:
            if record.year >= earliest_year:
                count += record.match_count
            if verbose:
                print(record)
            record = next(records)
    except StopIteration:
        pass
    # Default to 1 so our program doesn't crash
    if count == 0:
        count = 1
    if time_function:
        time2 = time.time()
        print('Total seconds for ' + word + ': ' + str(int(time2 - time1)))
    return count
def load_ngrams(my_ngram_len, indx):
    russian = "[А-Яа-я]+"
    tags = "ADJ|ADP|ADV|CONJ|NOUN|NUM|PRT|VERB"
    fname, url, records = next(
        readline_google_store(ngram_len=my_ngram_len, lang='rus', indices=[indx]))
    record = next(records)
    count = 0
    with open(str(my_ngram_len) + 'grams-' + indx + '.tsv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        while True:
            try:
                if record.year < 1918:
                    record = next(records)
                else:
                    if len(record.ngram) > 5 \
                            and re.search(russian, record.ngram) \
                            and not re.search(tags, record.ngram):
                        writer.writerow([record.ngram, record.year,
                                         record.match_count, record.volume_count])
                        count += 1
                        record = next(records)
                    else:
                        record = next(records)
            except StopIteration:
                break
    print(str(count) + " " + indx + " " + str(my_ngram_len) + "grams saved")
    return 0
def count_ngrams(phrase, length, lang):
    """The raw data for unigrams has been downloaded locally, but not for bigrams or trigrams."""
    count = 0
    chinese_character_to_sound = {
        u'\u5341': 's',
        u'\u4e8c': 'e',
        u'\u4e09': 's',
        u'\u56db': 's',
        u'\u4e94': 'w',
        u'\u516d': 'l',
        u'\u4e03': 'q',
        u'\u516b': 'b',
        u'\u4e5d': 'j'  # 九 (jiǔ)
    }
    ngram_downloader_langcode = {
        "english": "eng",
        "chinese": "chi-sim",
        "french": "fre",
        "german": "ger",
        "hebrew": "heb",
        "italian": "ita",
        "russian": "rus",
        "spanish": "spa"
    }
    if lang == "chinese":
        index = chinese_character_to_sound[phrase[0].lower()]
    else:
        index = phrase[0].lower()
    all_combinations = get_combo(index, length)
    print(all_combinations)
    fname, url, records = next(
        readline_google_store(ngram_len=length,
                              lang=ngram_downloader_langcode[lang],
                              indices=all_combinations))
    try:
        record = next(records)
        print(record.ngram)
        while record.ngram != phrase:
            record = next(records)
            print(record.ngram)
        while record.ngram == phrase:
            count += record.match_count
            record = next(records)
            print(record.ngram)
    except StopIteration:
        pass
    return count
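# `get_combo` is not defined in the snippet above. A minimal sketch of what such a helper
# might do, assuming the Google Books v2 index scheme (single letters for unigrams,
# two-character prefixes such as 'aa', 'ab', ..., 'a_' for longer n-grams); the body here
# is an assumption, not the original helper.
from string import ascii_lowercase

def get_combo(index, length):
    # For unigrams a single-letter index is enough; for longer n-grams expand the
    # starting letter into all two-character index combinations.
    if length == 1:
        return index
    return [index + second for second in ascii_lowercase + '_']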
def test_google_ngram_download():
    from google_ngram_downloader import readline_google_store
    fname, url, records = next(readline_google_store(ngram_len=5))
    debug('fname = ' + fname)
    debug('url = ' + url)
    record = next(records)
    # debug('next: ' + str(record))
    # debug('next gram: ' + str(record.ngram.encode('utf-8')))
    while record:
        ngram = record.ngram.encode('utf-8')
        if 'American' in ngram:
            debug('gram: ' + str(record))
        record = next(records)
def processngrams(index):
    length = 3
    try:
        ngram_dict = {}
        try:
            name, url, records = next(
                readline_google_store(ngram_len=length, indices=[index]))
        except:
            print('url not found')
            pass
        for record in records:
            if record.ngram in ngram_dict.keys():
                ngram_dict[record.ngram] = ngram_dict[record.ngram] + record.match_count
            else:
                ngram_dict[record.ngram] = record.match_count
        ngram_count = {}
        for key, value in ngram_dict.items():
            new_key = []
            for text in key.split():
                new_key.append(text.split('_')[0])
            new_key = ' '.join(new_key)
            if new_key in ngram_count.keys():
                ngram_count[new_key] = ngram_count[new_key] + value
            else:
                ngram_count[new_key] = value
        filename = str(length) + '_' + index
        filepath = filename + '.json'
        with open(filepath, 'w') as fp:
            json.dump(ngram_count, fp)
        print(name)
        s3 = boto3.client('s3',
                          region_name='ap-south-1',
                          aws_access_key_id=aws_access_key_id,
                          aws_secret_access_key=aws_secret_access_key)
        bucket = 'ei-marketingdata'
        s3_file = 'parentReport_test/{}'.format(filepath)
        s3.upload_file(filepath, bucket, s3_file)
    except Exception as e:
        print(e)
__author__ = 'pv'
from google_ngram_downloader import readline_google_store
from itertools import product
from string import ascii_lowercase, digits
import codecs

letter_indices = (''.join(i) for i in product(ascii_lowercase, ascii_lowercase + '_'))
letter_indices = (l for l in letter_indices if l != 'qk')

# One output file per year, 1850-2009 inclusive
fs = []
try:
    for year in range(1850, 2010):
        fs.append(codecs.open('google-ngrams/' + str(year), 'w', "utf-8"))
except:
    print("couldn't open files.", year)

i = 0
for fname, url, records in readline_google_store(ngram_len=5, lang='eng-fiction',
                                                 indices=letter_indices):
    print(fname)
    for record in records:
        if 1850 <= record.year < 2010:  # keep the year inside the range covered by fs
            out = fs[record.year - 1850]
            out_str = record.ngram + "\t" + str(record.match_count) + "\n"
            out.write(out_str)

for f in fs:
    f.close()
from google_ngram_downloader import readline_google_store
# https://github.com/dimazest/google-ngram-downloader

files = readline_google_store(ngram_len=5)
f_bundle = next(files, None)
sink = open("output.txt", "w")
while f_bundle is not None:
    fname, url, records = f_bundle
    print(fname)
    r = next(records, None)
    text = ""
    count = 0
    while r is not None:
        cText = r.ngram
        cCount = r.match_count
        cYear = r.year
        if cText != text:
            if count > 0:
                sink.write("{}\t{}\n".format(text, count))
            count = 0
            text = cText
        if cYear < 1980:
            # print(cText, cCount, cYear)
            count += cCount
        r = next(records, None)
    f_bundle = next(files, None)  # advance to the next file bundle
sink.close()
import csv
import os
import string

from ke_root import ROOT_OUTPUT
from google_ngram_downloader import readline_google_store

list_word = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
             'n', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ch']
dict_ngram = {}
for word in list_word:
    fnames, urls, records = next(readline_google_store(ngram_len=1, indices=word, lang='spa'))
    for i in records:
        ngram = str(i.ngram).lower()
        if ngram.find('_') == -1:
            if ngram in dict_ngram:
                temp = dict_ngram.get(ngram)
                freq = temp['freq'] + i.match_count
                count = temp['count'] + 1
                dict_ngram[ngram] = {'freq': freq, 'count': count}
            else:
                freq = i.match_count
                count = 1
                dict_ngram[ngram] = {'freq': freq, 'count': count}
            print('Calculated value for ngram = {0}'.format(ngram))

result = {}
for k, v in dict_ngram.items():
    relative_freq = round(float(v['freq'] / v['count']), 2)
    result[k] = relative_freq
    print('ngram = {0}, relative_freq = {1}'.format(k, relative_freq))

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
import os
import csv
import string

from google_ngram_downloader import readline_google_store

list_word = [
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'p',
    'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ch'
]
dict_ngram = {}
for word in list_word:
    fnames, urls, records = next(
        readline_google_store(ngram_len=1, indices=word, lang='spa'))
    for i in records:
        ngram = str(i.ngram).lower()
        if ngram.find('_') == -1:
            if ngram in dict_ngram:
                temp = dict_ngram.get(ngram)
                freq = temp['freq'] + i.match_count
                count = temp['count'] + 1
                dict_ngram[ngram] = {'freq': freq, 'count': count}
            else:
                freq = i.match_count
                count = 1
                dict_ngram[ngram] = {'freq': freq, 'count': count}
            print('Calculated value for ngram = {0}'.format(ngram))

result = {}
for k, v in dict_ngram.items():
    relative_freq = round(float(v['freq'] / v['count']), 2)
    result[k] = relative_freq
import collections
import sys
import codecs

from google_ngram_downloader import readline_google_store

"""
Script for fetching and aggregating bigram data provided by Google:
http://storage.googleapis.com/books/ngrams/books/datasetsv2.html

Requires google_ngram_downloader library:
https://pypi.python.org/pypi/google-ngram-downloader

Usage: python count-google-bigrams.py
"""

if __name__ == "__main__":
    reload(sys)
    sys.setdefaultencoding('utf8')
    chunks = readline_google_store(ngram_len=2, lang='eng')
    for fileName, url, records in chunks:
        if fileName[-14:] == 'punctuation.gz':
            break
        print "Processing " + fileName + "..."
        counts = collections.defaultdict(int)
        for r in records:
            bigram = r.ngram
            # Ignore if containing part of speech tag or comma (later used as delimiter)
            if '_' not in bigram and ',' not in bigram:
                # Set to lowercase and split at space
                [i, j] = bigram.lower().split()
                counts[(i, j)] += r.match_count
        # Write counts to file per chunk
        output = codecs.open(fileName[:-3] + "-aggregated.txt", "w", "utf-8")
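        # The excerpt ends right after the per-chunk output file is opened. A minimal
        # sketch of how the aggregated counts might be written out, using the comma
        # delimiter the comment above mentions; the exact output format is an assumption,
        # not the original code.
        for (i, j), n in counts.iteritems():
            output.write(u"{0},{1},{2}\n".format(i, j, n))
        output.close()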
list_not = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '_ADJ_', '_ADP_', '_ADV_', '_CONJ_', '_DET_',
    '_NOUN_', '_NUM_', '_PRON_', '_PRT_', '_VERB_'
]
ngrams = 3
result = {}
list_indices = util.get_indices(ngrams)
dict_ngram = {}
for item in list_indices:
    if not (item in list_not):
        list_tmp = []
        list_tmp.append(item)
        try:
            fnames, urls, records = next(
                readline_google_store(ngram_len=ngrams, indices=list_tmp, lang='spa'))
            for i in records:
                try:
                    ngram = str(i.ngram).lower()
                    # print(i)
                    if ngram.find('_') == -1:
                        if ngram in dict_ngram:
                            temp = dict_ngram.get(ngram)
                            freq = float(temp['freq'] + i.match_count)
                            count = temp['count'] + 1
                            dict_ngram[ngram] = {'freq': freq, 'count': count}
                        else:
                            freq = 1 if str(i.match_count) == '' else float(i.match_count)
                            count = 1
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from google_ngram_downloader import readline_google_store

fname, url, records = next(readline_google_store(ngram_len=5))

_debug = True
# _debug = False


def debug(*values):
    if _debug:
        log('debug', *values)


def info(*values):
    log('info', *values)


def log(level, *values):
    message = '[%s]' % level
    for value in values:
        message += '\t%s' % str(value)
    print message


test_path = 'test_data/SystemOut.log'
sw_list = []


def load_stop_words():
    resource_list = ['resources/chinese_stopWord.txt',
                     'resources/english_stopWord.txt',
                     'resources/sign_stopWord.txt',
                     'resources/union_stopWord.txt']
    # resource_list = ['resources/english_stopWord.txt']
    for res in resource_list:
        f = open(res)
from google_ngram_downloader import readline_google_store

fname, url, records = next(readline_google_store(ngram_len=1, indices='.'))
for x in range(0, 5):
    print(next(records))
import json
from kafka import SimpleProducer, KafkaClient
from google_ngram_downloader import readline_google_store

# To send messages synchronously
kafka = KafkaClient("ec2-52-35-7-236.us-west-2.compute.amazonaws.com:9092")
producer = SimpleProducer(kafka)

gene = readline_google_store(ngram_len=1, lang="eng")
while True:
    try:
        fname, url, records = next(gene)
        print url
    except StopIteration:
        print "END"
        break
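# The loop above only prints each file URL. A minimal sketch of how the URLs could be
# pushed through the SimpleProducer created earlier, using the legacy kafka-python
# send_messages API; the topic name "ngram-urls" is an assumption, not part of the
# original script.
gene = readline_google_store(ngram_len=1, lang="eng")
while True:
    try:
        fname, url, records = next(gene)
        # send_messages(topic, *messages) expects bytes payloads
        producer.send_messages("ngram-urls", url.encode("utf-8"))
    except StopIteration:
        break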
from google_ngram_downloader import readline_google_store
import wget
import hadoopy
import os

for i in range(3, 6):
    gene = readline_google_store(ngram_len=i, lang='eng')
    while True:
        try:
            fname, url, records = next(gene)
            print fname
            if hadoopy.exists('/google-ngram/' + str(i) + '/' + fname):
                continue
            else:
                wget.download(url)
                hadoopy.put(fname, '/google-ngram/' + str(i) + '/' + fname)
                os.remove(fname)
        except StopIteration:
            print "END"
            break
from google_ngram_downloader import readline_google_store

fname, url, records = next(readline_google_store(ngram_len=2))

import pymongo
from pymongo import MongoClient

client = MongoClient()
db = client['ngrams']

import re


def getEntry(d):
    ngram, year, match_count, volume_count = d[0:4]
    entry = {
        'ngram': ngram,
        'year': year,
        'match_count': match_count,
        'volume_count': volume_count
    }
    return entry


inspect = []
counter = 0
previous = ""
keep = 0
target = [u'conspir', u'scheme', u'stratagem', u'machination', u'cabal', u'deception',
          u'deceit', u'deceive', u'ploy', u'ruse', u'dodge', u'subterfuge', u'complot',
          u'colluder', u'collusion', u'collaborator', u'conniver', u'machinator',
          u'traitor', u'connive']
# started = False
for fname, url, records in readline_google_store(ngram_len=5, verbose=True):
    words_re = re.compile(r"|\b".join(target))
    print fname
    # if 'ad.gz' in str(fname):
    #     started = True
import sys
import re

from google_ngram_downloader import readline_google_store

if __name__ == "__main__":
    n = sys.argv[1]
    position = sys.argv[2]
    with open(f'{position}-{n}-grams.txt', 'w') as f:
        for fname, url, records in readline_google_store(ngram_len=int(n)):
            if re.search(r'[b-z][a-z]\.gz$', fname):
                print(fname)
                counter = 0
                for r in records:
                    if counter % 1000000 == 0:
                        print(r.ngram)
                    if position == 'tailing':
                        if re.search(r'^[a-zA-Z]', r.ngram) and r.ngram.endswith("._. _END_"):
                            f.write('{}\n'.format(r.ngram))
                    if position == "inner":
                        if all([
                                " ._. " in r.ngram,
                                " ._. ]" not in r.ngram,
                                " ._. /" not in r.ngram,
                                " ._. *" not in r.ngram,
                                not r.ngram.startswith("._."),
                                not r.ngram.endswith("_END_"),
                                not r.ngram.endswith("_."),
                        ]):
import nltk
import re
import string
from google_ngram_downloader import readline_google_store

# In this step, we load the `file name`, `url` of the ngram, and `record`. Records can be
# understood as the rows in the ngram files.
#
# `lang = 'chi-sim'` means just load the Chinese-Simplified corpus.
#
# ** Remember you need to be connected to the internet for this program to work **

# In[9]:

fname, url, records = next(readline_google_store(ngram_len=4, lang='chi-sim'))

# You can look at the `url` and use it to download the ngram file, if you want! (NOT RECOMMENDED :))

# First, I defined an empty `dictionary` called `total` to store the records and their word counts.
#
# Next, I set the `notEoF` variable to `True`. This variable serves as the flag for when we
# reach the end of the Google ngram records.
#
# Next, I used the try/except structure that Python provides for error handling. It reads the
# next record from the Google ngram data. After the last record has been read, an error is
# raised, which moves the program to the `except` section. There, `notEoF` becomes `False`,
# which stops the `while` loop around the `try` section.

# In[10]:

total = {}
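# The cell the text above describes is not included in this excerpt. A minimal sketch of
# the loop it outlines, assuming the match counts are summed per ngram into `total`; that
# aggregation detail is an assumption based on the surrounding prose.
notEoF = True
while notEoF:
    try:
        record = next(records)
        # sum match counts for each 4-gram across years
        total[record.ngram] = total.get(record.ngram, 0) + record.match_count
    except StopIteration:
        # the last record has been read; stop the loop
        notEoF = False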
import sys
from google_ngram_downloader import readline_google_store

with open(sys.argv[1]) as f:
    needed_ngrams = f.read().splitlines()
needed_ngrams.sort()
n = int(sys.argv[2])

# make a set of all indices that need to be downloaded
needed_indices = set([x[:min(n, 2)].lower() for x in needed_ngrams])
# print "Needed indices:", needed_indices

# create a map with each needed ngram as a key associated with a 0 value
ngram_counts_dict = {}
for needed_ngram in needed_ngrams:
    ngram_counts_dict[needed_ngram] = 0
# print ngram_counts_dict

# for each index, iterate over all entries for the index (lines aren't sorted)
for index in needed_indices:
    fname, url, records = next(readline_google_store(
        ngram_len=n, indices=(index if n == 1 else [index])))
    for record in records:
        # add counts for matching terms
        record_ngram = remove_tags(record.ngram)
        if record_ngram in ngram_counts_dict:
            ngram_counts_dict[record_ngram] += record.match_count

# print counts
for ngram, ngram_counts in ngram_counts_dict.iteritems():
    print ngram, ngram_counts
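# `remove_tags` is not defined in the excerpt above. A minimal sketch of what such a helper
# might look like, assuming it strips the part-of-speech suffixes (e.g. "_NOUN") that the
# Google Books ngram data attaches to tokens; the name matches the call above, but the body
# is an assumption.
def remove_tags(ngram):
    return ' '.join(token.split('_')[0] for token in ngram.split())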