def process_train_data(v: int, n: int, delta: float, vocab_size: int, train_file: str) -> Ngram:
    """
    Wrapper function for the training data processing. Either fetch or generate
    the necessary Ngrams based on the training information.
    :param v: Vocabulary choice
    :param n: ngram choice
    :param delta: Smoothing choice
    :param vocab_size: The size of the vocabulary
    :param train_file: Path to training data
    :return: Ngram
    """
    ngrams = Ngram(n)
    if ds.data_ser_exists(v, n, delta):
        print("Model with parameters already stored. Retrieving")
        ngrams = ds.data_ser_load(v, n, delta)
    else:
        print("Model with parameters not stored. Generating model from provided training data")
        train_data = pd.read_csv(
            train_file,
            delimiter='\t',
            names=[DF_COLUMN_ID, DF_COLUMN_NAME, DF_COLUMN_LANG, DF_COLUMN_TWEET])
        transform_to_vocab(train_data, v)
        print("Shape of Training Data (Rows, Columns) => {}".format(train_data.shape))
        ngrams.generate(train_data, delta, vocab_size)
        ds.data_ser_save(ngrams, v, n, delta)
    return ngrams
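# A minimal usage sketch of the wrapper above (not from the original source):
# the argument values and the tab-separated training file name are illustrative
# assumptions; in the real project they would come from command-line options.
if __name__ == "__main__":
    model = process_train_data(v=0, n=3, delta=0.5, vocab_size=26,
                               train_file="training-tweets.tsv")
    print(type(model))  # the Ngram object that was loaded or generated above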
def getNgramsByWord(self, word, ngramSize): if not ngramSize: return [] term = Ngram(word, ngramSize) if term.deriveNgrams(): return term.getNgrams() else: return []
def __init__(self, maxNgramcount, content):
    self.maxNgramcount = maxNgramcount
    self.corpus = content
    print("Generating syllable n-grams")
    self.ngrams_hece = [Ngram(i + 1, self.corpus, "hece", self)
                        for i in range(self.maxNgramcount)]
    print("---------------------------")
    print("Generating letter n-grams")
    self.ngrams_harf = [Ngram(i + 1, self.corpus, "harf", self)
                        for i in range(self.maxNgramcount)]
def exec_second(self, parole):
    e = EditDistance()
    a = Ngram()
    tempi = []
    n_vicine_trovate = []
    for parola in parole:
        with open('60000_parole_italiane.txt', 'r') as f:
            # print 'word -->', parola
            # edit distance
            # print '----- EDIT DISTANCE'
            e_results = []
            start = timer()
            for line in f:
                p = line.rstrip()
                _, op = e.edit_distance(parola, p)
                costo = e.op_sequence(op, len(parola) - 1, len(p) - 1, [])
                if costo < self.sogliaCosto:
                    e_results.append((p, costo))
            end = timer()
            time_edit = end - start
            n_edit = len(e_results)
            # print 'results (%s)' % n_edit, '-->', sorted(e_results, key=get(1))
            # print 'time -->', time_edit

        # ngrams
        # print '----- NGRAMS'
        g_results = []
        b = a.ngram(parola, self.numberOfGrams)
        with open("%s_grams.txt" % self.numberOfGrams, 'r') as r:
            start = timer()
            for line in r:
                s = line.split(' -> ')
                p, g = s[0], s[1]
                score = a.jaccard(b, g)
                if score > self.sogliaJaccard:
                    g_results.append((p, score))
            end = timer()
            time_gram = end - start
            n_gram = len(g_results)
            # print 'results (%s)' % n_gram, '-->', sorted(g_results, key=get(1), reverse=True)
            # print 'time -->', time_gram
            # print '\n'
        tempi.append([time_edit, time_gram])
        n_vicine_trovate.append([n_edit, n_gram])
    return [tempi, n_vicine_trovate]
def readFile(): uni_tag = Ngram() bi_tag = Ngram() tri_tag = Ngram() for s in stdin: tags = [] words = [] if s != '' and s != '\n': s = '<s>/BOS <s>/BOS '+s.rstrip()+' </s>/EOS' lst = s.split() for wt in lst: #parse words and tags t = wt.split('/')[-1] tags.append(t) idx = wt.rfind(t) w = wt[:idx-1] words.append(w) for i in range(len(tags)):#go through the training data,add counts of tag ngrams and tag-word co-occurences t = tags[i] w = words[i] uni_tag.addEntry(t,w) if i <= len(tags)-2: seq = ' '.join(tags[i:i+2]) bi_tag.addCount(seq) if i <= len(tags)-3: seq2 = ' '.join(tags[i:i+3]) tri_tag.addCount(seq2) return uni_tag,bi_tag,tri_tag
def parse(self, text):
    tokens = re.split(r'\s+', text)
    for wnum in xrange(0, len(tokens)):
        for ng_ord in xrange(1, self.max_order + 1):
            if wnum + ng_ord < len(tokens):
                words_tuple = tuple(tokens[wnum:wnum + ng_ord])
                ngram = self.storage_.get_n_gram(words_tuple)
                if ngram is None:
                    ngram = Ngram(1)
                else:
                    ngram.count = ngram.count + 1
                self.storage_.set_n_gram(words_tuple, ngram)
def test_hash_fn():
    ngram1 = Ngram('a-rose-is')
    ngram2 = Ngram('rose-is-a')
    assert ngram1.__hash__() != ngram2.__hash__(), 'the two hashes should not be the same'
    print 'Ngrams with different string values give different hashes... ok'
    ngram2.value = 'a-rose-is'
    assert ngram1.__hash__() == ngram2.__hash__(), 'the two hashes should be the same'
    print 'Ngrams with the same string values give the same hash... ok'
def data_ser_load(v: int, n: int, delta: float):
    """
    Loads the Ngram object, initializing the DataFrame for each language from the proper file.
    :param v: Vocabulary for the model
    :param n: Ngram size for the model
    :param delta: Delta value for the model
    :return ngrams: Ngram object.
    """
    ngrams = Ngram(n)
    for lang in LANGUAGES:
        ngrams.ngrams[lang] = pd.read_pickle(
            TRAINING_FILE_TEMPLATE.format(lang, v, n, delta))
    return ngrams
def exec_fifth(self):
    e = EditDistance()
    a = Ngram()
    originale = raw_input("**** Enter a word --> ")
    parola = self.storpia(originale)
    print '**** Mangled word -->', parola

    # edit distance
    print '----- EDIT DISTANCE'
    # cost thresholds: 1, 2, 3, 4, 5
    for c in range(1, 6):
        with open('60000_parole_italiane.txt', 'r') as f:
            e_results = []
            for line in f:
                p = line.rstrip()
                _, op = e.edit_distance(parola, p)
                costo = e.op_sequence(op, len(parola) - 1, len(p) - 1, [])
                if costo < c:
                    e_results.append((p, costo))
        if any(originale in res for res in e_results):
            w = 'original word found!'
        else:
            w = 'original word not found!'
        print w, '(cost threshold %s, %s results)' % (c, len(e_results)), '-->', sorted(e_results, key=get(1))

    # ngram
    print '----- NGRAM'
    b = a.ngram(parola, self.numberOfGrams)
    # Jaccard coefficients: 0.5, 0.6, 0.7, 0.8, 0.9
    for j in np.arange(0.5, 1.0, 0.1):
        with open("%s_grams.txt" % self.numberOfGrams, 'r') as f:
            g_results = []
            for line in f:
                s = line.split(' -> ')
                p, g = s[0], s[1]
                score = a.jaccard(b, g)
                if score > j:
                    g_results.append((p, score))
        if any(originale in res for res in g_results):
            w = 'original word found!'
        else:
            w = 'original word not found!'
        print w, '(jaccard %s, %s results)' % (j, len(g_results)), '-->', sorted(g_results, key=get(1), reverse=True)
def getNgramsByLine(self, ngramSize):
    if not ngramSize:
        return []
    occurency = []
    # split the given text into single lines
    lines = self.splitParagraph()
    for line in lines:
        term = Ngram(line, ngramSize)
        if term.deriveNgrams():
            occurency.append(term.getNgrams())
        else:
            occurency.append([])
    return occurency
def ngramStemmer(self, wordList, size, equality):
    "reduces wordList according to the n-gram stemming method"
    # collect the terms to be removed later in stopList
    returnList = []
    stopList = []
    ngramAdvas = Advas("", "")
    # calculate length and range
    listLength = len(wordList)
    outerListRange = range(0, listLength)
    for i in outerListRange:
        term1 = wordList[i]
        innerListRange = range(0, i)
        # define basic n-gram object
        term1Ngram = Ngram(term1, 2)
        term1Ngram.deriveNgrams()
        term1NgramList = term1Ngram.getNgrams()
        for j in innerListRange:
            term2 = wordList[j]
            term2Ngram = Ngram(term2, 2)
            term2Ngram.deriveNgrams()
            term2NgramList = term2Ngram.getNgrams()
            # calculate n-gram similarity and compare against the threshold
            ngramSimilarity = ngramAdvas.compareNgramLists(term1NgramList, term2NgramList)
            degree = ngramSimilarity - equality
            if degree > 0:
                # these terms are so similar that they can be conflated:
                # remove the longer term, keep the shorter one
                if len(term2) > len(term1):
                    stopList.append(term2)
                else:
                    stopList.append(term1)
    # conflate the list: remove all the items which appear in stopList
    return list(set(wordList) - set(stopList))
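# A self-contained illustration (not the original Advas/Ngram API) of the
# conflation idea used above: words whose character-bigram similarity (here a
# Dice coefficient, used as a stand-in for compareNgramLists) exceeds the
# equality threshold are treated as duplicates and the longer spelling dropped.
def bigrams(word):
    return {word[i:i + 2] for i in range(len(word) - 1)}

def similarity(w1, w2):
    b1, b2 = bigrams(w1), bigrams(w2)
    return 2.0 * len(b1 & b2) / (len(b1) + len(b2))

print(similarity("stemming", "stemmer"))  # high overlap, candidates for conflation
print(similarity("stemming", "corpus"))   # no shared bigrams, 0.0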
def do(n):
    with open('60000_parole_italiane.txt', 'r') as f:
        with open("%s_grams.txt" % n, 'w+') as r:
            for line in f:
                p = line.rstrip()
                g = Ngram().ngram(p, n)
                r.write("%s -> %s\n" % (p, g))
def main():
    from ngram import Ngram
    from model import Model
    from forest import Forest

    flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
    flags.DEFINE_integer("debuglevel", 0, "debug level")
    flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
    flags.DEFINE_boolean("cube", True, "using cube pruning to speedup")
    flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
    flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r")

    argv = FLAGS(sys.argv)

    weights = Model.cmdline_model()
    lm = Ngram.cmdline_ngram()
    false_decoder = CYKDecoder(weights, lm)

    def non_local_scorer(cedge, ders):
        (lmsc, alltrans, sig) = false_decoder.deltLMScore(cedge.lhsstr, ders)
        fv = Vector()
        fv["lm"] = lmsc
        return ((weights.dot(fv), fv), alltrans, sig)

    cube_prune = CubePruning(FeatureScorer(weights), non_local_scorer, FLAGS.k, FLAGS.ratio)

    for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
        a = false_decoder.beam_search(forest, b=FLAGS.beam)
        b = cube_prune.run(forest.root)
        # the beam-search result should agree with the cube-pruning result
        assert a[0] == b[0].score[0]
        assert a[1] == b[0].score[1]
        print a
        print b[0]
def add_ngram(self, ngram, tree=None): tree = self.tree if tree is None else tree if ngram in tree: tree[ngram].count += 1 else: tree[ngram] = Ngram(ngram, self.hindsight, self.foresight)
def eat_token_string(self, s, max_reach=2, max_ngram_size=2):
    for ngram_size in range(1, max_ngram_size + 1):
        for i in range(len(s)):
            start = i
            end = i + ngram_size
            if start >= 0 and end < len(s) + 1:
                before, current, after = s[:start], s[start:end], s[end:]
                if len(current) == 1:
                    self.wordcount += 1
                ngram = " ".join(current)
                if ngram in self.tree:
                    self.tree[ngram].count += 1
                else:
                    self.tree[ngram] = Ngram(ngram, 1, max_reach)
                for reach in range(1, max_reach + 1):
                    # update dictionary to reflect all words occurring after this ngram
                    try:
                        word = after[reach - 1]
                        # print 'after "%s" is "%s" with reach %s' % (ngram, word, reach)
                        self.tree[ngram].add_after(word, reach, 1)
                    except IndexError:
                        pass
def add_ngram(self, ngram, tree=None): """Adds an ngram to a given tree""" tree = self.tree if tree is None else tree if ngram in tree: tree[ngram].count += 1 else: tree[ngram] = Ngram(ngram, self.hindsight, self.foresight)
def exec_third(self):
    e = EditDistance()
    a = Ngram()
    costi = []
    coefficienti = []
    risultati_edit = []
    risultati_gram = []
    parola = raw_input("**** Enter a word --> ")

    # edit distance
    # print '----- EDIT DISTANCE'
    # cost thresholds: 1, 2, 3, 4, 5
    for c in range(1, 6):
        costi.append(c)
        with open('60000_parole_italiane.txt', 'r') as f:
            e_results = []
            for line in f:
                p = line.rstrip()
                _, op = e.edit_distance(parola, p)
                costo = e.op_sequence(op, len(parola) - 1, len(p) - 1, [])
                if costo < c:
                    e_results.append((p, costo))
        risultati_edit.append(len(e_results))
        # print 'found %s results for cost threshold %s' % (len(e_results), c), '-->', sorted(e_results, key=get(1))

    # ngram
    # print '----- NGRAM'
    b = a.ngram(parola, self.numberOfGrams)
    # Jaccard coefficients: 0.5, 0.6, 0.7, 0.8, 0.9
    for j in np.arange(0.5, 1.0, 0.1):
        coefficienti.append(j)
        with open("%s_grams.txt" % self.numberOfGrams, 'r') as f:
            g_results = []
            for line in f:
                s = line.split(' -> ')
                p, g = s[0], s[1]
                score = a.jaccard(b, g)
                if score > j:
                    g_results.append((p, score))
        risultati_gram.append(len(g_results))
        # print 'found %s results for jaccard greater than %s' % (len(g_results), j), '-->', sorted(g_results, key=get(1), reverse=True)
    return [costi, coefficienti, risultati_edit, risultati_gram]
def calculate_ngrams(self, doc, length=3):
    num_terms = len(doc)
    ngrams = []
    for t in xrange(num_terms):
        if num_terms <= t + length - 1:
            break
        # yields n-2 ngrams for the default length of 3
        ngram_tokens = doc[t:t + length]
        ngram_value = "-".join(ngram_tokens)
        ngram = Ngram(ngram_value)
        ngrams.append(ngram)
    return ngrams
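# A standalone sketch (not from the original code) of the same sliding-window
# idea: for a five-token document and length 3 it produces the n-2 = 3 joined
# trigram strings that calculate_ngrams above would wrap in Ngram objects.
def sliding_ngrams(tokens, length=3):
    return ["-".join(tokens[t:t + length]) for t in range(len(tokens) - length + 1)]

print(sliding_ngrams(["a", "rose", "is", "a", "rose"]))
# ['a-rose-is', 'rose-is-a', 'is-a-rose']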
def test_simple(self): # tc = unittest.TestCase() words1 = ("hello", "<punc>", "world") words2 = ("hello", "<punc>", "underworld") words3 = ("hello", "<punc>", "John") words4 = ("goodbye", "<punc>", "John") ng = NgramStorage(3) ng.set_n_gram(words1[0], Ngram(12, 0.1)) ng.set_n_gram(words1[0:1], Ngram(10, 0.08)) ng.set_n_gram(words1, Ngram(4, 0.02)) ng.set_n_gram(words2, Ngram(4, 0.02)) ng.set_n_gram(words3, Ngram(2, 0.01)) ng.set_n_gram(words4, Ngram(8, 0.05)) epsilon = 0.00001 self.assertEqual(8, ng.get_n_gram(words4).count) self.assertLess(abs(0.05 - ng.get_n_gram(words4).prob), epsilon) self.assertEqual(4, ng.get_n_gram(words1).count) self.assertLess(abs(0.02 - ng.get_n_gram(words1).prob), epsilon) self.assertIsNone(ng.get_n_gram(words4[0])) self.assertIsNone(ng.get_n_gram(words4[0:1])) self.assertEqual(4, len(ng.get_n_grams(3))) self.assertEqual(3, ng.max_order())
def exec_first(self):
    with open('60000_parole_italiane.txt', 'r') as f:
        e = EditDistance()
        a = Ngram()
        lines = f.readlines()
        rand = random.randint(0, len(lines) - 1)
        word = lines[rand].rstrip()
        print 'random word -->', word

        # test edit distance
        start = timer()
        for line in lines:
            p = line.rstrip()
            if p == word:
                break
            _, op = e.edit_distance(word, p)
            _ = e.op_sequence(op, len(word) - 1, len(p) - 1, [])
        end = timer()
        time_edit = end - start
        # print 'elapsed time, edit distance -->', time_edit

        # test ngrams
        b = a.ngram(word, self.numberOfGrams)
        with open("%s_grams.txt" % self.numberOfGrams, 'r') as r:
            start = timer()
            for line in r:
                s = line.split(' -> ')
                p, g = s[0], s[1]
                if p == word:
                    break
                _ = a.jaccard(b, g)
            end = timer()
            time_ngram = end - start
            # print 'elapsed time, ngrams -->', time_ngram
    return [word, time_edit, time_ngram]
def calcSuccVariety(self):
    # derive two-letter combinations
    ngramObject = Ngram(self.term, 2)
    ngramObject.deriveNgrams()
    ngramSet = set(ngramObject.getNgrams())
    # count appearances of the second letter
    varietyList = {}
    for entry in ngramSet:
        letter1 = entry[0]
        letter2 = entry[1]
        if letter1 in varietyList:
            items = varietyList[letter1]
            if letter2 not in items:
                # extend the existing one
                items.append(letter2)
                varietyList[letter1] = items
        else:
            # create a new one
            varietyList[letter1] = [letter2]
    return varietyList
def main(args): print(f'Loading corpus from `{args.data}`...') corpus = Corpus(args.data, order=args.order, lower=args.lower, max_lines=args.max_lines) model = Ngram(order=args.order) name = f'{args.name}.{args.order}gram' print('Example data:') print('Train:', corpus.train[:20]) print('Valid:', corpus.valid[:20]) print('Training model...') model.train(corpus.train, add_k=args.add_k, interpolate=args.interpolate, backoff=args.backoff) print(f'Vocab size: {len(model.vocab):,}') if args.save_arpa: print(f'Saving model to `{name}`...') model.save_arpa(name) assert model.sum_to_one(n=10) print('Generating text...') text = model.generate(100) text = ' '.join(text) path = os.path.join(args.out, f'generated.{name}.txt') print(text) with open(path, 'w') as f: print(text, file=f) if model.is_smoothed: print('\nPredicting test set NLL...') logprob = model(corpus.test) nll = -logprob / len(corpus.test) print(f'Test NLL: {nll:.2f} | Perplexity {exp(nll):.2f}') path = os.path.join(args.out, f'result.{name}.txt') with open(path, 'w') as f: print(f'Test NLL: {nll:.2f} | Perplexity {exp(nll):.2f}', file=f) else: exit( 'No evaluation with unsmoothed model: probability is probably 0 anyways.' )
def exec_fourth(self):
    a = Ngram()
    risultati = []
    # parola = raw_input("**** Enter a word --> ")
    parole = []
    with open('60000_parole_italiane.txt', 'r') as f:
        lines = f.readlines()
        for i in range(3):
            rand = random.randint(0, len(lines) - 1)
            word = lines[rand].rstrip()
            while len(word) < 5:
                rand = random.randint(0, len(lines) - 1)
                word = lines[rand].rstrip()
            parole.append(word)

    # gram sizes: 2, 3, 4
    for parola in parole:
        subarray = []
        for n in range(2, 5):
            with open("%s_grams.txt" % n, 'r') as f:
                b = a.ngram(parola, n)
                g_results = []
                for line in f:
                    s = line.split(' -> ')
                    p, g = s[0], s[1]
                    score = a.jaccard(b, g)
                    if score > self.sogliaJaccard:
                        g_results.append((p, score))
                # print 'found %s results for %s grams' % (len(g_results), n), '-->', sorted(g_results, key=get(1), reverse=True)
                subarray.append(len(g_results))
        risultati.append(subarray)
    return [parole, risultati]
class NgramTests(unittest.TestCase): def mock_ngram(self, string, count, frequency, sig_score): ngram = Ngram(string) ngram.count = count ngram.frequency = frequency ngram.sig_score = sig_score return ngram def setUp(self): ngram_after_1 = self.mock_ngram('bar', 1, 2, 3) ngram_after_2 = self.mock_ngram('baz', 7, 8, 9) self.ngram = Ngram('foo', 1, 1) self.ngram.after[0][ngram_after_1.string] = ngram_after_1 self.ngram.after[0][ngram_after_2.string] = ngram_after_2 self.ngram.before[0][ngram_after_1.string] = ngram_after_1 self.ngram.before[0][ngram_after_2.string] = ngram_after_2 def test_get_after__sort_attribute_count(self): self.assertEqual( self.ngram.get_after(sort_attribute="count"), [('baz', 7), ('bar', 1)] ) def test_get_after__sort_attribute_frequency(self): self.assertEqual( self.ngram.get_after(sort_attribute="frequency"), [('baz', 8), ('bar', 2)] ) def test_get_after__sort_attribute_sig_score(self): self.assertEqual( self.ngram.get_after(sort_attribute="sig_score"), [('baz', 9), ('bar', 3)] ) def test_get_before__sort_attribute_count(self): self.assertEqual( self.ngram.get_before(sort_attribute="count"), [('baz', 7), ('bar', 1)] ) def test_get_before__sort_attribute_frequency(self): self.assertEqual( self.ngram.get_before(sort_attribute="frequency"), [('baz', 8), ('bar', 2)] ) def test_get_before__sort_attribute_sig_score(self): self.assertEqual( self.ngram.get_before(sort_attribute="sig_score"), [('baz', 9), ('bar', 3)] )
def test():
    ng = Ngram()
    # Your n-gram model is trained with a text file
    # ng.train('data/wiki-ja-train.word')
    ng.train(args.train_file)
    # You can save your trained model as text. Currently, loading a trained model is not supported.
    # ng.dump('trained/wiki_ja_train_trained_model', n=1)
    ng.dump('{}-{}gram'.format(args.dump_file, args.N), n=args.N)
def make_ngram(ngrams, splited, n, n_doc): tmp = [] for i in range(0, len(splited)): for j in range(0, n): if i + j < len(splited): tmp.append(splited[i + j]) if len(tmp) == n: key = ' '.join(tmp) if key in ngrams: ngrams[key].occu_tot += 1 ngrams[key].docs.add(n_doc) else: ngram = Ngram(tmp, n_doc) ngrams[key] = ngram tmp = [] return ngrams
def enter_sequence(self, ngram, count, tree): components = ngram.split(' ') head = " ".join(components[:-1]) tail = components[-1] if head in tree: tree[head].count += count else: tree[head] = Ngram(ngram, count, 1, 0) self.wordcount += count * len(components) branch = tree[head].after[0] if tail in branch: branch[tail] += count else: branch[tail] = count
def main(): from ngram import Ngram from model import Model from forest import Forest flags.DEFINE_integer("beam", 100, "beam size", short_name="b") flags.DEFINE_integer("debuglevel", 0, "debug level") flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)") flags.DEFINE_boolean("cube", True, "using cube pruning to speedup") flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k") flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r") argv = FLAGS(sys.argv) [outfile] = argv[1:] weights = Model.cmdline_model() lm = Ngram.cmdline_ngram() false_decoder = CYKDecoder(weights, lm) out = utility.getfile(outfile, 1) old_bleu = Bleu() new_bleu = Bleu() for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1): oracle_forest, oracle_item = oracle_extracter(forest, weights, false_decoder, 100, 2, extract=100) print >>sys.stderr, "processed sent %s " % i oracle_forest.dump(out) bleu, hyp, fv, edgelist = forest.compute_oracle(weights, 0.0, 1) forest.bleu.rescore(hyp) old_bleu += forest.bleu forest.bleu.rescore(oracle_item[0].full_derivation) new_bleu += forest.bleu bad_bleu, _, _, _ = oracle_forest.compute_oracle(weights, 0.0, -1) #for i in range(min(len(oracle_item), 5)): # print >>sys.stderr, "Oracle Trans: %s %s %s" %(oracle_item[i].full_derivation, oracle_item[i].score, str(oracle_item[i].score[2])) # print >>sys.stderr, "Oracle BLEU Score: %s"% (forest.bleu.rescore(oracle_item[i].full_derivation)) print >>sys.stderr, "Oracle BLEU Score: %s"% (forest.bleu.rescore(oracle_item[0].full_derivation)) print >>sys.stderr, "Worst new Oracle BLEU Score: %s"% (bad_bleu) print >>sys.stderr, "Old Oracle BLEU Score: %s"% (bleu) print >>sys.stderr, "Running Oracle BLEU Score: %s"% (new_bleu.compute_score()) print >>sys.stderr, "Running Old Oracle BLEU Score: %s"% (old_bleu.compute_score())
def __init__( self, nnet_fname, scaler_fname, labels_fname, ngram_fname, logbase=1, loglevel=logging.INFO, ): self.nnet_fname = nnet_fname self.scaler_fname = scaler_fname self.labels_fname = labels_fname self.ngram_fname = ngram_fname self.logbase = logbase self.loglevel = loglevel self.loglevelname = logging._levelToName[loglevel].lower() Bantry.scaler = ScalerFactory(scaler_fname) Bantry.classifier = Classifier(nnet_fname, labels_fname, logbase=logbase) self.ng = Ngram(ngram_fname) Bantry.ngram = self.ng GramGraph.set_ngram(self.ng)
#coding:utf-8 from Dictionary import Dictionary from ngram import Ngram dict1 = Dictionary("dict.txt") while(True): ngram1 =Ngram(dict1) sentence = raw_input("please input a Chinese Sentence:").decode("cp936"); segmap=ngram1.getSeg(sentence) for sg in segmap: print(sg) #for eachkey in segmap: # if(isinstance(segmap[eachkey],tuple)): # print (eachkey+":"+segmap[eachkey][0]+','+segmap[eachkey][1]) # else: # print (eachkey+":"+segmap[eachkey]) #printSeg(segmap,sentence) #print segmap
def main(): gc.set_threshold(100000, 10, 10) flags.DEFINE_integer("beam", 100, "beam size", short_name="b") flags.DEFINE_integer("debuglevel", 0, "debug level") flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)") flags.DEFINE_boolean("cube", True, "using cube pruning to speedup") flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k") flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r") flags.DEFINE_boolean("dist", False, "ditributed (hadoop) training)") flags.DEFINE_string("prefix", "", "prefix for distributed training") flags.DEFINE_string("hadoop_weights", "", "hadoop weights (formatted specially)") flags.DEFINE_boolean("add_features", False, "add features to training data") flags.DEFINE_boolean("prune_train", False, "prune before decoding") flags.DEFINE_boolean("no_lm", False, "don't use the unigram language model") flags.DEFINE_boolean("pickleinput", False, "assumed input is pickled") flags.DEFINE_string("oracle_forests", None, "oracle forests", short_name="o") flags.DEFINE_string("feature_map_file", None, "file with the integer to feature mapping (for lbfgs)") flags.DEFINE_boolean("cache_input", False, "cache input sentences (only works for pruned input)") flags.DEFINE_string("rm_features", None, "list of features to remove") flags.DEFINE_boolean("just_basic", False, "remove all features but basic") argv = FLAGS(sys.argv) if FLAGS.weights: weights = Model.cmdline_model() else: vector = Vector() assert glob.glob(FLAGS.hadoop_weights) for file in glob.glob(FLAGS.hadoop_weights): for l in open(file): if not l.strip(): continue f, v = l.strip().split() vector[f] = float(v) weights = Model(vector) rm_features = set() if FLAGS.rm_features: for l in open(FLAGS.rm_features): rm_features.add(l.strip()) lm = Ngram.cmdline_ngram() if FLAGS.no_lm: lm = None if argv[1] == "train": local_decode = ChiangPerceptronDecoder(weights, lm) elif argv[1] == "sgd" or argv[1] == "crf": local_decode = MarginalDecoder(weights, lm) else: local_decode = MarginalDecoder(weights, lm) if FLAGS.add_features: tdm = local_features.TargetDataManager() local_decode.feature_adder = FeatureAdder(tdm) local_decode.prune_train = FLAGS.prune_train local_decode.use_pickle = FLAGS.pickleinput local_decode.cache_input = FLAGS.cache_input print >> logs, "Cache input is %s" % FLAGS.cache_input if FLAGS.debuglevel > 0: print >> logs, "beam size = %d" % FLAGS.beam if argv[1] == "train": if not FLAGS.dist: perc = trainer.Perceptron.cmdline_perc(local_decode) else: train_files = [FLAGS.prefix + file.strip() for file in sys.stdin] perc = distributed_trainer.DistributedPerceptron.cmdline_perc(local_decode) perc.set_training(train_files) perc.train() elif argv[1] == "sgd": crf = sgd.BaseCRF.cmdline_crf(local_decode) crf.set_oracle_files([FLAGS.oracle_forests]) crf.train() elif argv[1] == "crf": if not FLAGS.dist: crf = CRF.LBFGSCRF.cmdline_crf(local_decode) crf.set_oracle_files([FLAGS.oracle_forests]) crf.set_feature_mappers(add_features.read_features(FLAGS.feature_map_file)) crf.rm_features(rm_features) if FLAGS.just_basic: print "Enforcing Basic" crf.enforce_just_basic() crf.train() else: # train_files = [FLAGS.prefix+file.strip() for file in sys.stdin] # oracle_files = [file+".oracle" for file in train_files] print >> sys.stderr, "DistributedCRF" crf = distCRF.DistributedCRF.cmdline_distibuted_crf(local_decode) # os.system("~/.python/bin/dumbo rm train_input -hadoop /home/nlg-03/mt-apps/hadoop/0.20.1+169.89/") # os.system("~/.python/bin/dumbo put 
"+crf.trainfiles[0]+" train_input -hadoop /home/nlg-03/mt-apps/hadoop/0.20.1+169.89/") crf.set_feature_mappers(add_features.read_features(FLAGS.feature_map_file)) crf.rm_features(rm_features) if FLAGS.just_basic: print "Enforcing Basic" crf.enforce_just_basic() # crf.set_oracle_files(oracle_files) crf.train() else: if not FLAGS.dist: print "Evaluating" eval = Evaluator(local_decode, [FLAGS.dev]) eval.tune() else: dev_files = [FLAGS.prefix + file.strip() for file in sys.stdin] eval = Evaluator(local_decode, dev_files) print eval.eval(verbose=True).compute_score()
flags.DEFINE_string("rulefilter", None, "filter ruleset") flags.DEFINE_integer("max_height", 3, "maximum height of lhs for pattern-matching") flags.DEFINE_integer("example_limit", 1e10, "number of examples to use") flags.DEFINE_float("hope", 0, "hope weight") flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost)") from ngram import Ngram # defines --lm and --order argv = FLAGS(sys.argv) weights = Model.cmdline_model() lm = Ngram.cmdline_ngram() # if FLAGS.lm is None then returns None if lm: weights["lm1"] = weights["lm"] * FLAGS.lmratio reffiles = [open(f) for f in argv[1:]] convert_forest = ((FLAGS.ruleset is not None) or (FLAGS.rulefilter is not None) ) if FLAGS.ruleset is not None: ruleset = RuleSet(FLAGS.ruleset) if FLAGS.phrase is not None: ruleset.add_bp(FLAGS.phrase) Forest.globalruleid = ruleset.rule_num()
from ngram import Ngram from model import Model from forest import Forest flags.DEFINE_integer("beam", 100, "beam size", short_name="b") flags.DEFINE_integer("debuglevel", 0, "debug level") flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)") flags.DEFINE_boolean("cube", True, "using cube pruning to speedup") flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k") flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r") argv = FLAGS(sys.argv) weights = Model.cmdline_model() lm = Ngram.cmdline_ngram() decoder = CYKDecoder(weights, lm) tot_bleu = Bleu() tot_score = 0. tot_time = 0. tot_len = tot_fnodes = tot_fedges = 0 tot_lmedges = 0 tot_lmnodes = 0 if FLAGS.debuglevel > 0: print >>logs, "beam size = %d" % FLAGS.beam for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
v=self.mDict.getPvalue(sentence) if((v)>maxPvalue and self.mDict.isAWord(sentence)): self.valueMap[sentence]=v self.segMap[sentence]=sentence return v else: self.valueMap[sentence]=maxPvalue self.segMap[sentence]=wordPair return maxPvalue def getSeg(self): return self.segMap if(__name__ =="__main__"): ngram1 = Ngram("dict1") print ngram1.splitsentence("ABC") from Dictionary import Dictionary from ngram import Ngram def printSeg(segMap,sentence): if(segMap.has_key(sentence)): pair = segMap[sentence] if(isinstance(pair,tuple)): printSeg(segMap,pair[0]) printSeg(segMap,pair[1]) else: if(sentence==pair): print sentence
def increment_tree(self, ngram, count, tree, max_ngram_size): if ngram in tree: tree[ngram].count += count else: tree[ngram] = Ngram(ngram, count, max_ngram_size)
#!/usr/bin/env python3 import math from util import tokenize_data from ngram import Ngram import csv import os.path if __name__ == '__main__': train_filename = '../data/AllCommitAddLines.txt' train_data = tokenize_data(train_filename) print(train_data) ngram = Ngram(3) print("TRAINING STARTED...") list_of_bigrams, unigram_counts, bigram_counts, list_of_trigrams, trigram_counts = ngram.train( train_data) one_gram_prob = ngram.calculate_onegram_prob(unigram_counts) bigram_prob = ngram.calculate_bigram_prob(list_of_bigrams, unigram_counts, bigram_counts) trigram_prob = ngram.calculate_trigram_prob(list_of_trigrams, bigram_counts, trigram_counts) with open("input_csv_file_path.csv", 'r') as csvinput: with open("output_csv_file_path.csv", 'w') as csvoutput: writer = csv.writer(csvoutput, lineterminator='\n') reader = csv.reader(csvinput) all = [] row = next(reader) # print(row[0]) row.append('NGLP') all.append(row)