def save_word2vec_format(self, fname, fvocab=None, binary=False):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.
    """
    if fvocab is not None:
        logger.info("Storing vocabulary in %s" % (fvocab))
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
    logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), self.layer1_size, fname))
    assert (len(self.vocab), self.layer1_size) == self.syn0.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % self.syn0.shape))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
            row = self.syn0[vocab.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
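# Usage sketch (not from the original source; `model` and the file names are assumptions):
# the method above writes one "<word> <floats...>" line per word in text mode, or the packed
# C word2vec layout with binary=True. Because it goes through utils.smart_open, a .gz path
# should be written compressed transparently.
model.save_word2vec_format('vectors.txt')                          # plain text format
model.save_word2vec_format('vectors.bin', binary=True)             # binary word2vec format
model.save_word2vec_format('vectors.txt.gz', fvocab='vocab.txt')   # compressed output plus vocab counts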
def main():
    utils.unicodefy_std_io()

    parser = argparse.ArgumentParser(description='Decodes (and verifies) an atlassian license')
    parser.add_argument('--version', action='version',
                        version='Atlassian License Decoder {}'.format(__version__))
    parser.add_argument('-k', '--key', '--public-key', default='atlassian.pem',
                        help='a key file (contains at least a public DSA key) used to verify license '
                             '(default: %(default)s)')
    parser.add_argument('-V', '--no-verify', dest='verify', action='store_false',
                        help='skip license verification step')
    parser.add_argument('-i', '--input', default=utils.STD_IO_MARK,
                        help='from where to read license, default "%(default)s" means stdin')
    parser.add_argument('-o', '--output', default=utils.STD_IO_MARK,
                        help='where to save the decoded license, default "%(default)s" means stdout')

    unicode_args = map(lambda s: unicode(s, sys.getfilesystemencoding()), sys.argv)
    args = parser.parse_args(unicode_args[1:])

    with utils.smart_open(args.input, mode='rb') as f:
        atlassian_license = f.read()

    decoder = AtlassianLicenseDecoder(args.key)
    decompressed_content, verified = decoder.decode(atlassian_license, need_verify=args.verify)

    with utils.smart_open(args.output, mode='wb') as f:
        f.write(decompressed_content)

    if verified is not None and not verified:
        print('\nWARNING: the license can NOT be verified by the given public key', file=sys.stderr)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', nargs='+', help='Input files')
    parser.add_argument('-o', '--output-file', default='-', help='Output path')
    args = parser.parse_args()

    with utils.smart_open(args.output_file, 'w') as output_file:
        for input_path in args.input_file:
            with utils.smart_open(input_path, 'r') as input_file:
                parser = Parser(input_file)
                stmts = parser.parse()
                code_gen = CodeGenerator('quad')
                code_gen.gen(stmts)
def main():
    params = argparse.ArgumentParser(description='Alignment CLI')
    add_parameters(params)
    args = params.parse_args()

    trg_lengths = [len(x) for x in read_sentences(args.target)]
    src_lengths = [len(x) for x in read_sentences(args.source)]

    alignments, is_multiline = read_alignment_file(path=args.alignment,
                                                   trg_lengths=trg_lengths,
                                                   src_lengths=src_lengths)
    alignments = process_alignments(
        alignments=alignments,
        unaligned_target=args.unaligned_target,
        multiply_aligned_target=args.multiply_aligned_target,
        eps_index=args.unaligned_target_epsilon_index,
        bbn_multiply_aligned_target=args.bbn_multiply_aligned_target,
        bbn_unaligned_target=args.bbn_unaligned_target)

    if args.output_format is None:
        flat_output = not is_multiline
    else:
        flat_output = True if args.output_format == "flat" else False

    output_stream = sys.stdout if args.output is None else smart_open(args.output, mode='wt')
    print_alignments(alignments=alignments,
                     stream=output_stream,
                     print_unaligned_target=True if args.unaligned_target == "keep" else False,
                     eps_index=args.unaligned_target_epsilon_index,
                     flat=flat_output)
def __iter__(self):
    # the entire corpus is one gigantic line -- there are no sentence marks at all
    # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens
    sentence, rest, max_sentence_length = [], b'', 1000
    with utils.smart_open(self.fname) as fin:
        while True:
            text = rest + fin.read(8192)  # avoid loading the entire file (=1 line) into RAM
            if text == rest:  # EOF
                sentence.extend(rest.split())  # return the last chunk of words, too (may be shorter/longer)
                if sentence:
                    yield sentence
                break
            # the last token may have been split in two... keep it for the next iteration
            last_token = text.rfind(b' ')
            words, rest = (utils.to_unicode(text[:last_token]).split(),
                           text[last_token:].strip()) if last_token >= 0 else ([], text)
            sentence.extend(words)
            while len(sentence) >= max_sentence_length:
                yield sentence[:max_sentence_length]
                sentence = sentence[max_sentence_length:]
def docbyoffset(self, offset): """Return document at file offset `offset` (in bytes)""" # empty documents are not stored explicitly in MM format, so the index marks # them with a special offset, -1. if offset == -1: return [] if isinstance(self.input, string_types): fin = utils.smart_open(self.input) else: fin = self.input fin.seek(offset) # works for gzip/bz2 input, too previd, document = -1, [] for line in fin: docid, termid, val = line.split() if not self.transposed: termid, docid = docid, termid docid, termid, val = int(docid) - 1, int(termid) - 1, float( val ) # -1 because matrix market indexes are 1-based => convert to 0-based assert previd <= docid, "matrix columns must come in ascending order" if docid != previd: if previd >= 0: return document previd = docid document.append(( termid, val, )) # add another field to the current document return document
def read_alignment_file(path, trg_lengths, src_lengths):
    """
    read flat alignment file

    :param path: path to alignment file
    :param trg_lengths: array of target lengths (for each sentence)
    :param src_lengths: array of source lengths (for each sentence)
    :return: array of alignments (unprocessed)
    """
    check_condition(len(trg_lengths) == len(src_lengths),
                    "source and target sentences must be parallel")
    file = smart_open(path)
    content = file.readlines()

    if len(content) == len(trg_lengths):
        is_multiline = False
        alignments = _read_flat_alignment_file(content=content, trg_lengths=trg_lengths)
    else:
        is_multiline = True
        alignments = _read_multiline_alignment_file(content=content, trg_lengths=trg_lengths)

    check_condition(len(alignments) == len(trg_lengths), "alignments must be parallel")
    return alignments, is_multiline
def readArk(filename, limit=numpy.inf, memmap_dir='', memmap_dtype='float32'):
    """
    Reads the features in a Kaldi ark file.
    Returns a list of feature matrices and a list of the utterance IDs.
    """
    features = []
    uttids = []
    with smart_open(filename, "rb") as f:
        while True:
            try:
                uttid = readString(f).decode('utf-8')
            except ValueError:
                break
            feature = readMatrix(f)
            # use a memmap dir to hold the array content on disk
            # (e.g. an ssd cache that is larger than your main memory)
            if memmap_dir != '':
                feature_mmap = numpy.memmap(memmap_dir + '/' + uttid,
                                            dtype=memmap_dtype, mode='w+',
                                            shape=feature.shape)
                feature_mmap[:] = feature[:]
                feature_mmap.flush()
                features.append(feature_mmap)
                del feature
            else:
                features.append(feature)
            uttids.append(uttid)
            if len(features) == limit:
                break
    if memmap_dir != '':
        with io.open(memmap_dir + '/' + 'feature_map', 'w') as feature_map:
            for uttid, feature in zip(uttids, features):
                feature_map.write(uttid + " %i %i\n" % (feature.shape[0], feature.shape[1]))
    return features, uttids
def __init__(self, fname): self.fname = fname if fname.endswith(".gz") or fname.endswith('.bz2'): raise NotImplementedError( "compressed output not supported with MmWriter") self.fout = utils.smart_open( self.fname, 'wb+') # open for both reading and writing self.headers_written = False
def writeScp(filename, uttids, pointers, append=False):
    """
    Takes a list of utterance IDs and a list of strings in the format "filename:offset",
    and writes them to a Kaldi script file.
    """
    with smart_open(filename, "a" if append else "w") as f:
        for uttid, pointer in zip(uttids, pointers):
            f.write("%s %s\n" % (uttid, pointer))
def load_grammar(path, grammarfmt, transform):
    """
    Load a WCFG from a file.

    :args path: path to the grammar (or prefix path to rules and lexicon)
    :args grammarfmt: 'bar', 'discodop' or 'milos' (which looks like 'bar'
        but with terminals surrounded by quotes)
    :returns: WCFG
    """
    if grammarfmt == 'bar':
        istream = smart_open(path)
        grammar = wcfg.WCFG(wcfg.read_grammar_rules(istream, transform))
    elif grammarfmt == 'milos':
        istream = smart_open(path)
        grammar = wcfg.WCFG(wcfg.read_grammar_rules(istream, transform, strip_quotes=True))
    elif grammarfmt == 'discodop':
        grammar = discodopfmt.read_grammar('{0}.rules.gz'.format(path),
                                           '{0}.lex.gz'.format(path),
                                           transform)
    else:
        raise NotImplementedError("I don't know this grammar format: %s" % grammarfmt)
    return grammar
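# Hypothetical calls (file names and the transform are assumptions, not from the source):
# 'bar' and 'milos' point at a single rules file, while 'discodop' takes a prefix and expects
# <prefix>.rules.gz and <prefix>.lex.gz to exist. `transform` maps each rule's probability
# (num/den) to whatever weight the caller wants, e.g. math.log for log-probabilities.
from math import log

grammar = load_grammar('grammar.rules', 'bar', transform=log)
grammar = load_grammar('binarized', 'discodop', transform=log)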
def readScp(filename, limit=numpy.inf, memmap_dir='', memmap_dtype='float32'):
    """
    Reads the features in a Kaldi script file.
    Returns a list of feature matrices and a list of the utterance IDs.
    """
    features = []
    uttids = []
    with smart_open(filename, "r") as f:
        for line in f:
            uttid, pointer = line.strip().split()
            p = pointer.rfind(":")
            arkfile, offset = pointer[:p], int(pointer[p + 1:])
            with smart_open(arkfile, "rb") as g:
                g.seek(offset)
                feature = readMatrix(g)
            features.append(feature)
            uttids.append(uttid)
            if len(features) == limit:
                break
    return features, uttids
def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    `fname` is the file used to save the vectors in
    `fvocab` is an optional file used to save the vocabulary
    `binary` is an optional boolean indicating whether the data is to be saved
    in binary word2vec format (default: False)
    `total_vec` is an optional parameter to explicitly specify total no. of vectors
    (in case word vectors are appended with document vectors afterwards)
    """
    if total_vec is None:
        total_vec = len(self.vocab)
    vector_size = self.syn0.shape[1]
    if fvocab is not None:
        logger.info("storing vocabulary in %s" % (fvocab))
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
    logger.info("storing %sx%s projection weights into %s" % (total_vec, vector_size, fname))
    assert (len(self.vocab), vector_size) == self.syn0.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
            row = self.syn0[vocab.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
def __iter__(self): """Iterate through the lines in the source.""" try: # Assume it is a file-like object and try treating it as such # Things that don't have seek will trigger an exception self.source.seek(0) for item_no, line in enumerate(self.source): yield TaggedDocument(utils.to_unicode(line).split(), [item_no]) except AttributeError: # If it didn't work like a file, use it as a string filename with utils.smart_open(self.source) as fin: for item_no, line in enumerate(fin): yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
def __iter__(self): """Iterate through the lines in the source.""" try: # Assume it is a file-like object and try treating it as such # Things that don't have seek will trigger an exception self.source.seek(0) for line in self.source: yield utils.to_unicode(line).split() except AttributeError: # If it didn't work like a file, use it as a string filename with utils.smart_open(self.source) as fin: for line in fin: yield utils.to_unicode(line).split()
def save_cat2vec_format(self, fname): """ Store cat vectors """ logger.info("storing %sx%s projection weights into %s" % (self.cat_len, self.layer1_size, fname)) assert (self.cat_len, self.layer1_size) == self.cats.shape with utils.smart_open(fname, 'wb') as fout: fout.write(utils.to_utf8("#cats_len: %d\n#size:%d\n" % self.cats.shape)) fout.write(utils.to_utf8("#sg:%d\n#hs:%d\n#negative:%d\n#cbow_mean:%d\n" % (self.sg,self.hs,self.negative,self.cbow_mean))) for cat_id in self.cat_no_hash.keys(): row = self.cats[self.cat_no_hash[cat_id]] fout.write(utils.to_utf8("%s\t%s\n" % (cat_id, ' '.join("%f" % val for val in row))))
def output(html, out_filename):
    soup = BeautifulSoup(html, 'lxml')
    with utils.smart_open(out_filename, 'w', encoding='utf-8') as f:
        for p in soup.find_all('p'):
            bold = p.find('font', attrs={'style': 'FONT-WEIGHT: bold; FONT-SIZE: 13px;'})
            if bold and bold.text.strip().endswith(':'):
                bold.decompose()
            if p.find('img'):
                continue
            text = p.text
            if text:
                print(text, file=f)
def __iter__(self): """Iterate through the lines in the source.""" try: # Assume it is a file-like object and try treating it as such # Things that don't have seek will trigger an exception self.source.seek(0) for line in self.source: yield line.split() except AttributeError: # If it didn't work like a file, use it as a string filename with utils.smart_open(self.source) as fin: for line in fin: yield line.split()
def iterrules(path, transform):
    fi = smart_open(path)
    for line in fi:
        line = line.strip()
        if not line:
            continue
        fields = line.split()
        lhs = fields[0]
        num, den = fields[-1].split('/')
        num = float(num)
        den = float(den)
        rhs = fields[1:-2]  # fields[-2] is the yield function, which we are ignoring
        yield Rule(make_nonterminal(lhs), [make_nonterminal(s) for s in rhs], transform(num / den))
def __iter__(self):
    # try:
    #     self.source.seek(0)
    #     for line in self.source:
    #         yield utils.to_unicode(line).split()
    #
    # except AttributeError:
    # If it didn't work like a file, use it as a string filename
    with utils.smart_open(self.source) as fin:
        for line in fin:
            t3 = self.parse(line)
            if t3 != None:
                yield t3
def read_sentences(path): """ read file line by line and split words :param path: file to read :return: array of lines """ file = smart_open(path) sentences = [] for line in file.readlines(): tokens = line.strip().split(" ") tokens = list(filter(bool, tokens)) sentences.append(tokens) return sentences
def iterlexicon(path, transform):
    fi = smart_open(path)
    for line in fi:
        line = line.strip()
        if not line:
            continue
        fields = line.split('\t')
        word = fields[0]
        for pair in fields[1:]:
            tag, fraction = pair.split(' ')
            num, den = fraction.split('/')
            num = float(num)
            den = float(den)
            yield Rule(make_nonterminal(tag), [make_terminal(word)], transform(num / den))
def __iter__(self):
    if not self.split:
        try:
            self.source.seek(0)
            for line in self.source:
                k = utils.to_unicode(line.rstrip()).split("\t")
                yield k[self.cont_col:], k[self.sent_col], k[self.cat_col]
        except AttributeError:
            with utils.smart_open(self.source) as fin:
                for line in fin:
                    k = utils.to_unicode(line.rstrip()).split("\t")
                    yield k[self.cont_col:], k[self.sent_col], k[self.cat_col]
    else:
        if isinstance(self.source, list):
            split_files = self.source
        else:
            split_files = glob.glob(self.source + ".[a-z][a-z]")
        if self.rand:
            random.shuffle(split_files)
        for source in split_files:
            with utils.smart_open(source) as fin:
                for line in fin:
                    k = utils.to_unicode(line.rstrip()).split("\t")
                    yield k[self.cont_col:], k[self.sent_col], k[self.cat_col]
def writeArk(filename, features, uttids, append=False):
    """
    Takes a list of feature matrices and a list of utterance IDs, and writes
    them to a Kaldi ark file. Returns a list of strings in the format
    "filename:offset", which can be used to write a Kaldi script file.
    """
    pointers = []
    with smart_open(filename, "ab" if append else "wb") as f:
        for feature, uttid in zip(features, uttids):
            writeString(f, uttid.encode('utf-8'))
            pointers.append("%s:%d" % (filename, f.tell()))
            writeMatrix(f, feature)
    return pointers
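# Usage sketch (file names are assumptions): writeArk() returns "ark:offset" pointers that
# writeScp() serializes, and readScp() later follows those pointers back into the ark file,
# so a write/read round trip should reproduce the original feature matrices and utterance IDs.
pointers = writeArk('feats.ark', features, uttids)
writeScp('feats.scp', uttids, pointers)
features_again, uttids_again = readScp('feats.scp')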
def __iter__(self):
    for fname in os.listdir(self.dirname):
        fname = os.path.join(self.dirname, fname)
        if not os.path.isfile(fname):
            continue
        for line in utils.smart_open(fname):
            line = utils.to_unicode(line)
            # each file line is a single sentence in the Brown corpus
            # each token is WORD/POS_TAG
            token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
            # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
            words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
            if not words:  # don't bother sending out empty sentences
                continue
            yield words
def __iter__(self):
    for fname in os.listdir(self.dirname):
        fname = os.path.join(self.dirname, fname)
        if not os.path.isfile(fname):
            continue
        for item_no, line in enumerate(utils.smart_open(fname)):
            line = utils.to_unicode(line)
            # each file line is a single document in the Brown corpus
            # each token is WORD/POS_TAG
            token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
            # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
            words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
            if not words:  # don't bother sending out empty documents
                continue
            yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)])
def __iter__(self):
    try:
        self.source.seek(0)
        for line in self.source:
            k = utils.to_unicode(line.rstrip()).split("\t")
            categories = k[3].split(" ")
            for cat in categories:
                if "/" in cat:
                    continue
                yield k[4:], k[1], cat
    except AttributeError:
        with utils.smart_open(self.source) as fin:
            for line in fin:
                k = utils.to_unicode(line.rstrip()).split("\t")
                categories = k[3].split(" ")
                for cat in categories:
                    if "/" in cat:
                        continue
                    yield k[4:], k[1], cat
def save_doc2vec_format(self, fname):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.
    """
    logger.info("storing %sx%s projection weights into %s" % (self.sents_len, self.layer1_size, fname))
    assert (self.sents_len, self.layer1_size) == self.sents.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % self.sents.shape))
        # store in sorted order: most frequent words at the top
        for sent_no in xrange(self.sents_len):
            row = self.sents[sent_no]
            fout.write(utils.to_utf8("sent_%d %s\n" % (sent_no, ' '.join("%f" % val for val in row))))
def main():
    utils.unicodefy_std_io()

    parser = argparse.ArgumentParser(description='Generates a pair of DSA keys')
    parser.add_argument('--version', action='version',
                        version='DSA Key Generator {}'.format(__version__))
    parser.add_argument('-o', '--output', default=utils.STD_IO_MARK,
                        help='where to save the generated keys pair, default "%(default)s" means '
                             'print keys to stdout')
    parser.add_argument('-b', '--bits', type=int, default=1024,
                        help='the length of the prime to be generated in bits (default: %(default)s)')
    parser.add_argument('--cipher',
                        help='name of symmetric key algorithm and mode to encrypt the private key, '
                             'such as aes_128_cbc')
    parser.add_argument('--passphrase',
                        help='a password used to protect the private key when using `cipher`. If not given, '
                             'you might be asked to enter password during generation process.')

    unicode_args = map(lambda s: unicode(s, sys.getfilesystemencoding()), sys.argv)
    args = parser.parse_args(unicode_args[1:])

    dsa_key = DsaKey(args.bits)
    with utils.smart_open(args.output, mode='wb') as f:
        f.write(dsa_key.get_private_key(cipher=args.cipher, pass_phrase=args.passphrase))
        f.write(dsa_key.get_public_key())
def save_as_text(self, fname, sort_by_word=True):
    """
    Save this Dictionary to a text file, in format:
    `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`.
    Sorted by word, or by decreasing word frequency.

    Note: the text format should be used for corpus inspection. Use `save`/`load`
    to store in binary format (pickle) for improved performance.
    """
    logger.info("saving dictionary mapping to %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        if sort_by_word:
            for token, tokenid in sorted(iteritems(self.token2id)):
                line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                fout.write(utils.to_utf8(line))
        else:
            for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                fout.write(utils.to_utf8(line))
def load_from_text(fname): """ Load a previously stored Dictionary from a text file. Mirror function to `save_as_text`. """ result = Dictionary() with utils.smart_open(fname) as f: for lineno, line in enumerate(f): line = utils.to_unicode(line) try: wordid, word, docfreq = line[:-1].split('\t') except Exception: raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip())) wordid = int(wordid) if word in result.token2id: raise KeyError( 'token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word])) result.token2id[word] = wordid result.dfs[wordid] = int(docfreq) return result
def main():
    utils.unicodefy_std_io()

    parser = argparse.ArgumentParser(description='Generates an atlassian license')
    parser.add_argument('--version', action='version',
                        version='Atlassian License Generator {}'.format(__version__))
    parser.add_argument('template',
                        help='path to a license template yaml file, such as `templates/jira.yml`')
    parser.add_argument('organisation',
                        help='your company name used to register the product')
    parser.add_argument('server_id', nargs='?',
                        help='server id, usually in format of `ABCD-1234-EFGH-5678`')
    parser.add_argument('-o', '--output', default=utils.STD_IO_MARK,
                        help='where to save the generated license, default "%(default)s" means stdout')
    parser.add_argument('--show-raw', action='store_true',
                        help='also prints raw (not encoded) license content to stderr')
    parser.add_argument('-k', '--key', '--private-key', default='calfzhou.pem',
                        help='a key file (contains at least a private DSA key) used to sign the '
                             'license (default: %(default)s)')
    parser.add_argument('--passphrase',
                        help='password used by the private key. If not given, you might be asked '
                             'to enter it when needed.')

    def parse_variable_definition(text):
        parts = text.split('=', 1)
        if len(parts) < 2:
            raise argparse.ArgumentTypeError('unrecognized variable definition "{}"'.format(text))
        return tuple(parts)

    group = parser.add_argument_group('customizing license arguments',
                                      'use these arguments to over-write default license template '
                                      'or variables')
    group.add_argument('-v', '--var', action='append', type=parse_variable_definition,
                       help='custom variable used by template, e.g. -v number_of_users=200')

    unicode_args = map(lambda s: unicode(s, sys.getfilesystemencoding()), sys.argv)
    args = parser.parse_args(unicode_args[1:])

    custom_variables = None
    if args.var:
        custom_variables = {key: value for key, value in args.var}

    generator = AtlanssianLicenseGenerator(args.template)
    atlassian_license = generator.generate(args.organisation, args.server_id, custom_variables)
    if args.show_raw:
        print(atlassian_license, file=sys.stderr)

    encoder = AtlassianLicenseEncoder(args.key, args.passphrase)
    encoded_license = encoder.encode(atlassian_license)
    with utils.smart_open(args.output, mode='wb') as f:
        f.write(encoded_license)
def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).
    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline())
        vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
        result = Word2Vec(size=layer1_size)
        result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)
        if binary:
            binary_len = dtype(REAL).itemsize * layer1_size
            for line_no in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have newline, some don't)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word))
                if counts is None:
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
        else:
            for line_no, line in enumerate(fin):
                parts = utils.to_unicode(line).split()
                if len(parts) != layer1_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                word, weights = parts[0], map(REAL, parts[1:])
                if counts is None:
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = weights

    logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
    result.init_sims(norm_only)
    return result
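# Usage sketch (file names are assumptions; assumes the classmethod above is bound to the
# Word2Vec class, as in older gensim releases): loading vectors produced by the original C
# tool or by save_word2vec_format() earlier in this listing. The optional vocab file only
# supplies real word counts; norm_only=True keeps just the L2-normalised vectors in memory.
model = Word2Vec.load_word2vec_format('vectors.bin', binary=True)
model = Word2Vec.load_word2vec_format('vectors.txt', fvocab='vocab.txt', binary=False)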
def accuracy(self, questions, restrict_vocab=30000):
    """
    Compute accuracy of the model. `questions` is a filename where lines are
    4-tuples of words, split into sections by ": SECTION NAME" lines.
    See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

    The accuracy is reported (=printed to log and returned as a list) for each
    section separately, plus there's one aggregate summary at the end.

    Use `restrict_vocab` to ignore all questions containing a word whose frequency
    is not in the top-N most frequent words (default top 30,000).

    This method corresponds to the `compute-accuracy` script of the original C word2vec.
    """
    ok_vocab = dict(sorted(iteritems(self.vocab), key=lambda item: -item[1].count)[:restrict_vocab])
    ok_index = set(v.index for v in itervalues(ok_vocab))

    def log_accuracy(section):
        correct, incorrect = section['correct'], section['incorrect']
        if correct + incorrect > 0:
            logger.info("%s: %.1f%% (%i/%i)" %
                        (section['section'], 100.0 * correct / (correct + incorrect),
                         correct, correct + incorrect))

    sections, section = [], None
    for line_no, line in enumerate(utils.smart_open(questions)):
        # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
        line = utils.to_unicode(line)
        if line.startswith(': '):
            # a new section starts => store the old section
            if section:
                sections.append(section)
                log_accuracy(section)
            section = {'section': line.lstrip(': ').strip(), 'correct': 0, 'incorrect': 0}
        else:
            if not section:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
            try:
                a, b, c, expected = [word.lower() for word in line.split()]  # TODO assumes vocabulary preprocessing uses lowercase, too...
            except:
                logger.info("skipping invalid line #%i in %s" % (line_no, questions))
            if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                logger.debug("skipping line #%i with OOV words: %s" % (line_no, line))
                continue

            ignore = set(self.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
            predicted = None
            # find the most likely prediction, ignoring OOV words and input words
            for index in argsort(self.most_similar(positive=[b, c], negative=[a], topn=False))[::-1]:
                if index in ok_index and index not in ignore:
                    predicted = self.index2word[index]
                    if predicted != expected:
                        logger.debug("%s: expected %s, predicted %s" % (line.strip(), expected, predicted))
                    break
            section['correct' if predicted == expected else 'incorrect'] += 1

    if section:
        # store the last section, too
        sections.append(section)
        log_accuracy(section)

    total = {'section': 'total',
             'correct': sum(s['correct'] for s in sections),
             'incorrect': sum(s['incorrect'] for s in sections)}
    log_accuracy(total)
    sections.append(total)
    return sections
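# Usage sketch (assumes `model` is an instance exposing the accuracy() method above):
# questions-words.txt is the analogy file from the original word2vec distribution. Each
# returned section carries its own correct/incorrect counts, with an aggregate 'total'
# entry appended at the end.
sections = model.accuracy('questions-words.txt', restrict_vocab=30000)
total = sections[-1]
print(total['section'], total['correct'], total['incorrect'])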
def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
                         unicode_errors='strict', limit=None, datatype=REAL):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).

    If you trained the C model using non-utf8 encoding for words, specify that
    encoding in `encoding`.

    `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
    argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
    file may include word tokens truncated in the middle of a multibyte unicode character
    (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.

    `limit` sets a maximum number of word-vectors to read from the file. The default,
    None, means read all.

    `datatype` (experimental) can coerce dimensions to a non-default float type (such as
    np.float16) to save memory. (Such types may result in much slower bulk operations or
    incompatibility with optimized routines.)
    """
    counts = None
    if fvocab is not None:
        print("loading word counts from %s" % fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    print("loading projection weights from %s" % fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls()
        result.vector_size = vector_size
        result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            # print("word id: %d, word: %s, weights: %s" % (word_id, word, weights))
            if word in result.vocab:
                print("duplicate word '%s' in %s, ignoring all but first" % (word, fname))
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                print("vocabulary file is incomplete: '%s' is missing" % word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.syn0[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)

    if result.syn0.shape[0] != len(result.vocab):
        print("duplicate words detected, shrinking matrix size from %i to %i"
              % (result.syn0.shape[0], len(result.vocab)))
        result.syn0 = ascontiguousarray(result.syn0[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.syn0.shape

    print("loaded %s matrix from %s" % (result.syn0.shape, fname))
    return result
def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
                         unicode_errors='strict', limit=None, datatype=REAL):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).

    If you trained the C model using non-utf8 encoding for words, specify that
    encoding in `encoding`.

    `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
    argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
    file may include word tokens truncated in the middle of a multibyte unicode character
    (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.

    `limit` sets a maximum number of word-vectors to read from the file. The default,
    None, means read all.

    `datatype` (experimental) can coerce dimensions to a non-default float type (such as
    np.float16) to save memory. (Such types may result in much slower bulk operations or
    incompatibility with optimized routines.)
    """
    counts = None
    if fvocab is not None:
        logging.debug("loading word counts from %s" % fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logging.debug("loading projection weights from %s" % fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls()
        result.vector_size = vector_size
        result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            # logging.debug("word id: %d, word: %s, weights: %s" % (word_id, word, weights))
            if word in result.vocab:
            logging.debug("duplicate word '%s' in %s, ignoring all but first" % (word, fname)) if False else None
            if word in result.vocab:
                logging.debug("duplicate word '%s' in %s, ignoring all but first" % (word, fname))
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logging.debug("vocabulary file is incomplete: '%s' is missing" % word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.syn0[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)

    if result.syn0.shape[0] != len(result.vocab):
        logging.debug("duplicate words detected, shrinking matrix size from %i to %i"
                      % (result.syn0.shape[0], len(result.vocab)))
        result.syn0 = ascontiguousarray(result.syn0[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.syn0.shape

    '''
    KDTree
    Build KDTree with vectors.
    http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree
    '''
    result.kdt = KDTree(result.syn0, leaf_size=10, metric="euclidean")

    logging.debug("loaded %s matrix from %s" % (result.syn0.shape, fname))
    return result
def get_tokens(self, words):
    for i in range(1, len(self.line)):
        if not words.get(self.line[i - 1]):
            words[self.line[i - 1]] = {self.line[i]: 1}
        elif not words[self.line[i - 1]].get(self.line[i]):
            words[self.line[i - 1]][self.line[i]] = 1
        else:
            words[self.line[i - 1]][self.line[i]] += 1
    return words


parser = argparse.ArgumentParser(description='A script which collects words from file')
parser.add_argument('--input-dir', dest='directory', type=str, default='stdin',
                    help='File directory')
parser.add_argument('--model', required=True, type=str, help='Save file')
parser.add_argument('--lc', action='store_true', help='Switch to lowercase')
args = parser.parse_args()

if __name__ == '__main__':
    words = {}
    with utils.smart_open(args.directory, "r") as fin:
        for line in utils.all_files_generator(fin):
            p = Parser(line)
            p.preprocess(args.lc)
            p.get_tokens(words)
    utils.dump_dictionary(args.model, words)