Example #1
    def save_word2vec_format(self, fname, fvocab=None, binary=False):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        """
        if fvocab is not None:
            logger.info("Storing vocabulary in %s" % (fvocab))
            with utils.smart_open(fvocab, 'wb') as vout:
                for word, vocab in sorted(iteritems(self.vocab),
                                          key=lambda item: -item[1].count):
                    vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
        logger.info("storing %sx%s projection weights into %s" %
                    (len(self.vocab), self.layer1_size, fname))
        assert (len(self.vocab), self.layer1_size) == self.syn0.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % self.syn0.shape))
            # store in sorted order: most frequent words at the top
            for word, vocab in sorted(iteritems(self.vocab),
                                      key=lambda item: -item[1].count):
                row = self.syn0[vocab.index]
                if binary:
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(
                        utils.to_utf8("%s %s\n" %
                                      (word, ' '.join("%f" % val
                                                      for val in row))))
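The writer above produces the plain-text word2vec layout: a "vocab_size vector_size" header line followed by one "word v1 v2 ..." line per word. A minimal standalone sketch for reading that layout back, assuming an uncompressed UTF-8 file (hypothetical helper, no gensim dependency):

import numpy as np

def read_word2vec_text(path):
    # Parse the "vocab_size vector_size" header, then one "word v1 v2 ..." line per word.
    vectors = {}
    with open(path, encoding='utf8') as fin:
        vocab_size, vector_size = map(int, fin.readline().split())
        for line in fin:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.array(parts[1:], dtype=np.float32)
    return vectors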
Example #2
def main():
    utils.unicodefy_std_io()

    parser = argparse.ArgumentParser(
        description='Decodes (and verifies) an atlassian license')
    parser.add_argument(
        '--version',
        action='version',
        version='Atlassian License Decoder {}'.format(__version__))
    parser.add_argument(
        '-k',
        '--key',
        '--public-key',
        default='atlassian.pem',
        help=
        'a key file (contains at least a public DSA key) used to verify license (default: '
        '%(default)s)')
    parser.add_argument('-V',
                        '--no-verify',
                        dest='verify',
                        action='store_false',
                        help='skip license verification step')
    parser.add_argument(
        '-i',
        '--input',
        default=utils.STD_IO_MARK,
        help='from where to read license, default "%(default)s" means stdin')
    parser.add_argument(
        '-o',
        '--output',
        default=utils.STD_IO_MARK,
        help=
        'where to save the decoded license, default "%(default)s" means stdout'
    )

    unicode_args = map(lambda s: unicode(s, sys.getfilesystemencoding()),
                       sys.argv)
    args = parser.parse_args(unicode_args[1:])

    with utils.smart_open(args.input, mode='rb') as f:
        atlassian_license = f.read()

    decoder = AtlassianLicenseDecoder(args.key)
    decompressed_content, verified = decoder.decode(atlassian_license,
                                                    need_verify=args.verify)

    with utils.smart_open(args.output, mode='wb') as f:
        f.write(decompressed_content)

    if verified is not None and not verified:
        print(
            '\nWARNING: the license can NOT be verified by the given public key',
            file=sys.stderr)
Example #3
File: cpl.py Project: guy-david/cpl
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', nargs='+', help='Input files')
    parser.add_argument('-o', '--output-file', default='-', help='Output path')
    args = parser.parse_args()

    with utils.smart_open(args.output_file, 'w') as output_file:
        for input_path in args.input_file:
            with utils.smart_open(input_path, 'r') as input_file:
                parser = Parser(input_file)
                stmts = parser.parse()

                code_gen = CodeGenerator('quad')
                code_gen.gen(stmts)
Example #4
def main():
    params = argparse.ArgumentParser(description='Alignment CLI')
    add_parameters(params)
    args = params.parse_args()

    trg_lengths = [len(x) for x in read_sentences(args.target)]
    src_lengths = [len(x) for x in read_sentences(args.source)]

    alignments, is_multiline = read_alignment_file(path=args.alignment,
                                                   trg_lengths=trg_lengths,
                                                   src_lengths=src_lengths)

    alignments = process_alignments(
        alignments=alignments,
        unaligned_target=args.unaligned_target,
        multiply_aligned_target=args.multiply_aligned_target,
        eps_index=args.unaligned_target_epsilon_index,
        bbn_multiply_aligned_target=args.bbn_multiply_aligned_target,
        bbn_unaligned_target=args.bbn_unaligned_target)

    if args.output_format is None:
        flat_output = not is_multiline
    else:
        flat_output = True if args.output_format == "flat" else False

    output_stream = sys.stdout if args.output is None else smart_open(
        args.output, mode='wt')
    print_alignments(alignments=alignments,
                     stream=output_stream,
                     print_unaligned_target=True
                     if args.unaligned_target == "keep" else False,
                     eps_index=args.unaligned_target_epsilon_index,
                     flat=flat_output)
Example #5
 def __iter__(self):
     # the entire corpus is one gigantic line -- there are no sentence marks at all
     # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens
     sentence, rest, max_sentence_length = [], b'', 1000
     with utils.smart_open(self.fname) as fin:
         while True:
             text = rest + fin.read(
                 8192)  # avoid loading the entire file (=1 line) into RAM
             if text == rest:  # EOF
                 sentence.extend(
                     rest.split()
                 )  # return the last chunk of words, too (may be shorter/longer)
                 if sentence:
                     yield sentence
                 break
             last_token = text.rfind(
                 b' '
             )  # the last token may have been split in two... keep it for the next iteration
             words, rest = (
                 utils.to_unicode(text[:last_token]).split(),
                 text[last_token:].strip()) if last_token >= 0 else ([],
                                                                     text)
             sentence.extend(words)
             while len(sentence) >= max_sentence_length:
                 yield sentence[:max_sentence_length]
                 sentence = sentence[max_sentence_length:]
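The same chunking idea can be exercised without a file. A small sketch on an in-memory byte stream, with the block size and sentence length shrunk for readability (these values are illustrative, not the ones used above):

import io

def chunked_sentences(stream, max_len=5, block=16):
    # Mirror of the loop above: accumulate whitespace-separated tokens and
    # emit fixed-length "sentences" of max_len tokens each.
    sentence, rest = [], b''
    while True:
        text = rest + stream.read(block)
        if text == rest:  # EOF: flush whatever is left over
            sentence.extend(rest.decode('utf8').split())
            if sentence:
                yield sentence
            return
        last_space = text.rfind(b' ')  # keep a possibly split last token for the next round
        words, rest = (text[:last_space].decode('utf8').split(), text[last_space:].strip()) if last_space >= 0 else ([], text)
        sentence.extend(words)
        while len(sentence) >= max_len:
            yield sentence[:max_len]
            sentence = sentence[max_len:]

print(list(chunked_sentences(io.BytesIO(b'a b c d e f g h i j k'))))
# [['a', 'b', 'c', 'd', 'e'], ['f', 'g', 'h', 'i', 'j'], ['k']]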
Example #6
    def docbyoffset(self, offset):
        """Return document at file offset `offset` (in bytes)"""
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        if offset == -1:
            return []
        if isinstance(self.input, string_types):
            fin = utils.smart_open(self.input)
        else:
            fin = self.input

        fin.seek(offset)  # works for gzip/bz2 input, too
        previd, document = -1, []
        for line in fin:
            docid, termid, val = line.split()
            if not self.transposed:
                termid, docid = docid, termid
            docid, termid, val = int(docid) - 1, int(termid) - 1, float(
                val
            )  # -1 because matrix market indexes are 1-based => convert to 0-based
            assert previd <= docid, "matrix columns must come in ascending order"
            if docid != previd:
                if previd >= 0:
                    return document
                previd = docid

            document.append((
                termid,
                val,
            ))  # add another field to the current document
        return document
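The per-line conversion above is easy to check in isolation: Matrix Market entries are "docid termid value" triples with 1-based indices, turned into 0-based (termid, value) pairs per document. A tiny sketch with a made-up line:

def parse_mm_entry(line, transposed=True):
    # "docid termid value" with 1-based indices -> (docid0, (termid0, value))
    docid, termid, val = line.split()
    if not transposed:
        termid, docid = docid, termid
    return int(docid) - 1, (int(termid) - 1, float(val))

print(parse_mm_entry("3 7 0.5"))  # (2, (6, 0.5))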
Example #7
def read_alignment_file(path, trg_lengths, src_lengths):
    """
    read flat alignment file
    :param path: path to alignment file
    :param trg_lengths: array of target lengths (for each sentence)
    :param src_lengths: array of source lengths (for each sentence)
    :return: array of alignments (unprocessed)
    """
    check_condition(
        len(trg_lengths) == len(src_lengths),
        "source and target sentences must be parallel")
    file = smart_open(path)
    content = file.readlines()
    if len(content) == len(trg_lengths):
        is_multiline = False
        alignments = _read_flat_alignment_file(content=content,
                                               trg_lengths=trg_lengths)
    else:
        is_multiline = True
        alignments = _read_multiline_alignment_file(content=content,
                                                    trg_lengths=trg_lengths)

    check_condition(
        len(alignments) == len(trg_lengths), "alignment must be parallel")
    return alignments, is_multiline
Example #8
def readArk(filename, limit=numpy.inf, memmap_dir='', memmap_dtype='float32'):
    """
    Reads the features in a Kaldi ark file.
    Returns a list of feature matrices and a list of the utterance IDs.
    """
    features = []
    uttids = []
    with smart_open(filename, "rb") as f:
        while True:
            try:
                uttid = readString(f).decode('utf-8')
            except ValueError:
                break
            feature = readMatrix(f)
            # use a memmap dir to hold the array content on a disk (e.g. ssd cache that is larger than your main memory)
            if memmap_dir != '':
                feature_mmap = numpy.memmap(memmap_dir + '/' + uttid,
                                            dtype=memmap_dtype,
                                            mode='w+',
                                            shape=feature.shape)
                feature_mmap[:] = feature[:]
                feature_mmap.flush()
                features.append(feature_mmap)
                del feature
            else:
                features.append(feature)
            uttids.append(uttid)
            if len(features) == limit: break
    if memmap_dir != '':
        with io.open(memmap_dir + '/' + 'feature_map', 'w') as feature_map:
            for uttid, feature in zip(uttids, features):
                feature_map.write(uttid + " %i %i\n" %
                                  (feature.shape[0], feature.shape[1]))
    return features, uttids
Example #9
    def docbyoffset(self, offset):
        """Return document at file offset `offset` (in bytes)"""
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        if offset == -1:
            return []
        if isinstance(self.input, string_types):
            fin = utils.smart_open(self.input)
        else:
            fin = self.input

        fin.seek(offset) # works for gzip/bz2 input, too
        previd, document = -1, []
        for line in fin:
            docid, termid, val = line.split()
            if not self.transposed:
                termid, docid = docid, termid
            docid, termid, val = int(docid) - 1, int(termid) - 1, float(val) # -1 because matrix market indexes are 1-based => convert to 0-based
            assert previd <= docid, "matrix columns must come in ascending order"
            if docid != previd:
                if previd >= 0:
                    return document
                previd = docid

            document.append((termid, val,)) # add another field to the current document
        return document
Example #10
 def __init__(self, fname):
     self.fname = fname
     if fname.endswith(".gz") or fname.endswith('.bz2'):
         raise NotImplementedError(
             "compressed output not supported with MmWriter")
     self.fout = utils.smart_open(
         self.fname, 'wb+')  # open for both reading and writing
     self.headers_written = False
Example #11
def writeScp(filename, uttids, pointers, append=False):
    """
    Takes a list of utterance IDs and a list of strings in the format "filename:offset",
      and writes them to a Kaldi script file.
    """
    with smart_open(filename, "a" if append else "w") as f:
        for uttid, pointer in zip(uttids, pointers):
            f.write("%s %s\n" % (uttid, pointer))
Example #12
def load_grammar(path, grammarfmt, transform):
    """
    Load a WCFG from a file.

    :args path: path to the grammar (or prefix path to rules and lexicon)
    :args grammarfmt: 'bar',  'discodop' or 'milos' (which looks like 'bar' but with terminals surrounded by quotes)
    :returns: WCFG
    """
    if grammarfmt == 'bar':
        istream = smart_open(path)
        grammar = wcfg.WCFG(wcfg.read_grammar_rules(istream, transform))
    elif grammarfmt == 'milos':
        istream = smart_open(path)
        grammar = wcfg.WCFG(wcfg.read_grammar_rules(istream, transform, strip_quotes=True))
    elif grammarfmt == 'discodop':
        grammar = discodopfmt.read_grammar('{0}.rules.gz'.format(path), '{0}.lex.gz'.format(path), transform)
    else:
        raise NotImplementedError("I don't know this grammar format: %s" % grammarfmt)
    return grammar
Example #13
def readScp(filename, limit=numpy.inf, memmap_dir='', memmap_dtype='float32'):
    """
    Reads the features in a Kaldi script file.
    Returns a list of feature matrices and a list of the utterance IDs.
    """
    features = []
    uttids = []
    with smart_open(filename, "r") as f:
        for line in f:
            uttid, pointer = line.strip().split()
            p = pointer.rfind(":")
            arkfile, offset = pointer[:p], int(pointer[p + 1:])
            with smart_open(arkfile, "rb") as g:
                g.seek(offset)
                feature = readMatrix(g)
            features.append(feature)
            uttids.append(uttid)
            if len(features) == limit: break
    return features, uttids
Example #14
    def save_word2vec_format(self,
                             fname,
                             fvocab=None,
                             binary=False,
                             total_vec=None):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

         `fname` is the file used to save the vectors in
         `fvocab` is an optional file used to save the vocabulary
         `binary` is an optional boolean indicating whether the data is to be saved
         in binary word2vec format (default: False)
         `total_vec` is an optional parameter to explicitly specify total no. of vectors
         (in case word vectors are appended with document vectors afterwards)

        """
        if total_vec is None:
            total_vec = len(self.vocab)
        vector_size = self.syn0.shape[1]
        if fvocab is not None:
            logger.info("storing vocabulary in %s" % (fvocab))
            with utils.smart_open(fvocab, 'wb') as vout:
                for word, vocab in sorted(iteritems(self.vocab),
                                          key=lambda item: -item[1].count):
                    vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
        logger.info("storing %sx%s projection weights into %s" %
                    (total_vec, vector_size, fname))
        assert (len(self.vocab), vector_size) == self.syn0.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
            # store in sorted order: most frequent words at the top
            for word, vocab in sorted(iteritems(self.vocab),
                                      key=lambda item: -item[1].count):
                row = self.syn0[vocab.index]
                if binary:
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(
                        utils.to_utf8("%s %s\n" %
                                      (word, ' '.join("%f" % val
                                                      for val in row))))
Example #15
 def __iter__(self):
     """Iterate through the lines in the source."""
     try:
         # Assume it is a file-like object and try treating it as such
         # Things that don't have seek will trigger an exception
         self.source.seek(0)
         for item_no, line in enumerate(self.source):
             yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
     except AttributeError:
         # If it didn't work like a file, use it as a string filename
         with utils.smart_open(self.source) as fin:
             for item_no, line in enumerate(fin):
                 yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
Example #16
 def __iter__(self):
     """Iterate through the lines in the source."""
     try:
         # Assume it is a file-like object and try treating it as such
         # Things that don't have seek will trigger an exception
         self.source.seek(0)
         for line in self.source:
             yield utils.to_unicode(line).split()
     except AttributeError:
         # If it didn't work like a file, use it as a string filename
         with utils.smart_open(self.source) as fin:
             for line in fin:
                 yield utils.to_unicode(line).split()
Example #17
    def save_cat2vec_format(self, fname):
        """
        Store cat vectors

        """
        logger.info("storing %sx%s projection weights into %s" % (self.cat_len, self.layer1_size, fname))
        assert (self.cat_len, self.layer1_size) == self.cats.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("#cats_len: %d\n#size:%d\n" % self.cats.shape))
            fout.write(utils.to_utf8("#sg:%d\n#hs:%d\n#negative:%d\n#cbow_mean:%d\n" % (self.sg,self.hs,self.negative,self.cbow_mean)))
            for cat_id in self.cat_no_hash.keys():
                row = self.cats[self.cat_no_hash[cat_id]]
                fout.write(utils.to_utf8("%s\t%s\n" % (cat_id, ' '.join("%f" % val for val in row))))
Example #18
def output(html, out_filename):
    soup = BeautifulSoup(html, 'lxml')
    with utils.smart_open(out_filename, 'w', encoding='utf-8') as f:
        for p in soup.find_all('p'):
            bold = p.find(
                'font', attrs={'style': 'FONT-WEIGHT: bold; FONT-SIZE: 13px;'})
            if bold and bold.text.strip().endswith(':'):
                bold.decompose()
            if p.find('img'):
                continue
            text = p.text
            if text:
                print(text, file=f)
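A hypothetical call to output() above, assuming bs4 with the lxml parser and this module's utils.smart_open are available; the bold label is stripped, the image-only paragraph is skipped, and the rest is written out:

html = (
    '<p><font style="FONT-WEIGHT: bold; FONT-SIZE: 13px;">Author:</font> Jane Doe</p>'
    '<p><img src="figure.png"/></p>'
    '<p>Plain paragraph text.</p>'
)
output(html, 'clean.txt')  # clean.txt keeps " Jane Doe" and "Plain paragraph text."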
Example #19
    def save_word2vec_format(self, fname, fvocab=None, binary=False):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        """
        if fvocab is not None:
            logger.info("Storing vocabulary in %s" % (fvocab))
            with utils.smart_open(fvocab, 'wb') as vout:
                for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                    vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
        logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), self.layer1_size, fname))
        assert (len(self.vocab), self.layer1_size) == self.syn0.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % self.syn0.shape))
            # store in sorted order: most frequent words at the top
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                row = self.syn0[vocab.index]
                if binary:
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
Example #20
 def __iter__(self):
     """Iterate through the lines in the source."""
     try:
         # Assume it is a file-like object and try treating it as such
         # Things that don't have seek will trigger an exception
         self.source.seek(0)
         for line in self.source:
             yield line.split()
     except AttributeError:
         # If it didn't work like a file, use it as a string filename
         with utils.smart_open(self.source) as fin:
             for line in fin:
                 yield line.split()
Example #21
def iterrules(path, transform):
    fi = smart_open(path)
    for line in fi:
        line = line.strip()
        if not line:
            continue
        fields = line.split()
        lhs = fields[0]
        num, den = fields[-1].split('/')
        num = float(num)
        den = float(den)
        rhs = fields[1:-2]  # fields[-2] is the yield function, which we are ignoring
        yield Rule(make_nonterminal(lhs), [make_nonterminal(s) for s in rhs], transform(num/den))
Example #22
    def __iter__(self):

        # try:
        #     self.source.seek(0)
        #     for line in self.source:
        #         yield utils.to_unicode(line).split()
        #
        # except AttributeError:
            # If it didn't work like a file, use it as a string filename
        with utils.smart_open(self.source) as fin:
            for line in fin:
                t3 = self.parse(line)
                if t3 is not None:
                    yield t3
Example #23
def read_sentences(path):
    """
    read file line by line and split words
    :param path: file to read
    :return: array of lines
    """
    file = smart_open(path)
    sentences = []
    for line in file.readlines():
        tokens = line.strip().split(" ")
        tokens = list(filter(bool, tokens))
        sentences.append(tokens)

    return sentences
Example #24
def iterlexicon(path, transform):
    fi = smart_open(path)
    for line in fi:
        line = line.strip()
        if not line:
            continue
        fields = line.split('\t')
        word = fields[0]
        for pair in fields[1:]:
            tag, fraction = pair.split(' ')
            num, den = fraction.split('/')
            num = float(num)
            den = float(den)
            yield Rule(make_nonterminal(tag), [make_terminal(word)], transform(num/den))
Example #25
 def __iter__(self):
     if not self.split:
         try:
             self.source.seek(0)
             for line in self.source:
                 k = utils.to_unicode(line.rstrip()).split("\t")
                 yield k[self.cont_col:],k[self.sent_col],k[self.cat_col]
         except AttributeError:
             with utils.smart_open(self.source) as fin:
                 for line in fin:
                     k = utils.to_unicode(line.rstrip()).split("\t")
                     yield k[self.cont_col:],k[self.sent_col],k[self.cat_col]
     else:
         if isinstance(self.source, list):
             split_files = self.source
         else:
             split_files = glob.glob(self.source+".[a-z][a-z]")
         if self.rand: random.shuffle(split_files)
         for source in split_files:
             with utils.smart_open(source) as fin:
                 for line in fin:
                     k = utils.to_unicode(line.rstrip()).split("\t")
                     yield k[self.cont_col:],k[self.sent_col],k[self.cat_col]
Example #26
def writeArk(filename, features, uttids, append=False):
    """
    Takes a list of feature matrices and a list of utterance IDs,
      and writes them to a Kaldi ark file.
    Returns a list of strings in the format "filename:offset",
      which can be used to write a Kaldi script file.
    """
    pointers = []
    with smart_open(filename, "ab" if append else "wb") as f:
        for feature, uttid in zip(features, uttids):
            writeString(f, uttid.encode('utf-8'))
            pointers.append("%s:%d" % (filename, f.tell()))
            writeMatrix(f, feature)
    return pointers
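A hypothetical round trip using the Kaldi helpers defined across these examples (writeArk, writeScp, readScp), assuming they live in one module together with the writeString/writeMatrix/readMatrix/smart_open helpers they call:

import numpy

feats = [numpy.random.rand(100, 13).astype('float32') for _ in range(2)]
uttids = ['utt1', 'utt2']
pointers = writeArk('demo.ark', feats, uttids)   # one "demo.ark:<offset>" per utterance
writeScp('demo.scp', uttids, pointers)
feats_back, uttids_back = readScp('demo.scp')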
Example #27
def load_grammar(path, grammarfmt, transform):
    """
    Load a WCFG from a file.

    :args path: path to the grammar (or prefix path to rules and lexicon)
    :args grammarfmt: 'bar',  'discodop' or 'milos' (which looks like 'bar' but with terminals surrounded by quotes)
    :returns: WCFG
    """
    if grammarfmt == 'bar':
        istream = smart_open(path)
        grammar = wcfg.WCFG(wcfg.read_grammar_rules(istream, transform))
    elif grammarfmt == 'milos':
        istream = smart_open(path)
        grammar = wcfg.WCFG(
            wcfg.read_grammar_rules(istream, transform, strip_quotes=True))
    elif grammarfmt == 'discodop':
        grammar = discodopfmt.read_grammar('{0}.rules.gz'.format(path),
                                           '{0}.lex.gz'.format(path),
                                           transform)
    else:
        raise NotImplementedError("I don't know this grammar format: %s" %
                                  grammarfmt)
    return grammar
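A hypothetical call, assuming the project's wcfg/discodopfmt modules are importable and "grammar.txt" holds rules in the 'bar' format; transform maps each raw probability num/den to the weight actually stored (log-space here):

import math

grammar = load_grammar('grammar.txt', grammarfmt='bar', transform=math.log)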
Example #28
def iterlexicon(path, transform):
    fi = smart_open(path)
    for line in fi:
        line = line.strip()
        if not line:
            continue
        fields = line.split('\t')
        word = fields[0]
        for pair in fields[1:]:
            tag, fraction = pair.split(' ')
            num, den = fraction.split('/')
            num = float(num)
            den = float(den)
            yield Rule(make_nonterminal(tag), [make_terminal(word)],
                       transform(num / den))
Example #29
 def __iter__(self):
     if not self.split:
         try:
             self.source.seek(0)
             for line in self.source:
                 k = utils.to_unicode(line.rstrip()).split("\t")
                 yield k[self.cont_col :], k[self.sent_col], k[self.cat_col]
         except AttributeError:
             with utils.smart_open(self.source) as fin:
                 for line in fin:
                     k = utils.to_unicode(line.rstrip()).split("\t")
                     yield k[self.cont_col :], k[self.sent_col], k[self.cat_col]
     else:
         if isinstance(self.source, list):
             split_files = self.source
         else:
             split_files = glob.glob(self.source + ".[a-z][a-z]")
         if self.rand:
             random.shuffle(split_files)
         for source in split_files:
             with utils.smart_open(source) as fin:
                 for line in fin:
                     k = utils.to_unicode(line.rstrip()).split("\t")
                     yield k[self.cont_col :], k[self.sent_col], k[self.cat_col]
Example #30
 def __iter__(self):
     for fname in os.listdir(self.dirname):
         fname = os.path.join(self.dirname, fname)
         if not os.path.isfile(fname):
             continue
         for line in utils.smart_open(fname):
             line = utils.to_unicode(line)
             # each file line is a single sentence in the Brown corpus
             # each token is WORD/POS_TAG
             token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
             # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
             words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
             if not words:  # don't bother sending out empty sentences
                 continue
             yield words
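The WORD/POS_TAG parsing above can be checked on a single Brown-style line; punctuation tags such as "./." are dropped because tag[:2].isalpha() fails:

line = 'The/at Fulton/np-tl County/nn-tl said/vbd ./.'
token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
words = ['%s/%s' % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
print(words)  # ['the/at', 'fulton/np', 'county/nn', 'said/vb']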
Example #31
def iterrules(path, transform):
    fi = smart_open(path)
    for line in fi:
        line = line.strip()
        if not line:
            continue
        fields = line.split()
        lhs = fields[0]
        num, den = fields[-1].split('/')
        num = float(num)
        den = float(den)
        rhs = fields[
            1:-2]  # fields[-2] is the yield function, which we are ignoring
        yield Rule(make_nonterminal(lhs), [make_nonterminal(s) for s in rhs],
                   transform(num / den))
Example #32
 def __iter__(self):
     for fname in os.listdir(self.dirname):
         fname = os.path.join(self.dirname, fname)
         if not os.path.isfile(fname):
             continue
         for item_no, line in enumerate(utils.smart_open(fname)):
             line = utils.to_unicode(line)
             # each file line is a single document in the Brown corpus
             # each token is WORD/POS_TAG
             token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
             # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
             words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
             if not words:  # don't bother sending out empty documents
                 continue
             yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)])
Example #33
 def __iter__(self):
     try:
         self.source.seek(0)
         for line in self.source:
             k = utils.to_unicode(line.rstrip()).split("\t")
             categories = k[3].split(" ")
             for cat in categories:
                 if "/" in cat: continue
                 yield k[4:], k[1], cat
     except AttributeError:
         with utils.smart_open(self.source) as fin:
             for line in fin:
                 k = utils.to_unicode(line.rstrip()).split("\t")
                 categories = k[3].split(" ")
                 for cat in categories:
                     if "/" in cat: continue
                     yield k[4:], k[1], cat
Example #34
 def __iter__(self):
     # the entire corpus is one gigantic line -- there are no sentence marks at all
     # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens
     sentence, rest, max_sentence_length = [], b'', 1000
     with utils.smart_open(self.fname) as fin:
         while True:
             text = rest + fin.read(8192)  # avoid loading the entire file (=1 line) into RAM
             if text == rest:  # EOF
                 sentence.extend(rest.split()) # return the last chunk of words, too (may be shorter/longer)
                 if sentence:
                     yield sentence
                 break
             last_token = text.rfind(b' ')  # the last token may have been split in two... keep it for the next iteration
             words, rest = (utils.to_unicode(text[:last_token]).split(), text[last_token:].strip()) if last_token >= 0 else ([], text)
             sentence.extend(words)
             while len(sentence) >= max_sentence_length:
                 yield sentence[:max_sentence_length]
                 sentence = sentence[max_sentence_length:]
Example #35
    def save_doc2vec_format(self, fname):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        """
        logger.info("storing %sx%s projection weights into %s" %
                    (self.sents_len, self.layer1_size, fname))
        assert (self.sents_len, self.layer1_size) == self.sents.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % self.sents.shape))
            # store sentence vectors in index order (sent_0, sent_1, ...)
            for sent_no in xrange(self.sents_len):
                row = self.sents[sent_no]
                fout.write(
                    utils.to_utf8("sent_%d %s\n" %
                                  (sent_no, ' '.join("%f" % val
                                                     for val in row))))
Example #36
def main():
    utils.unicodefy_std_io()

    parser = argparse.ArgumentParser(
        description='Generates a pair of DSA keys')
    parser.add_argument('--version',
                        action='version',
                        version='DSA Key Generator {}'.format(__version__))
    parser.add_argument(
        '-o',
        '--output',
        default=utils.STD_IO_MARK,
        help=
        'where to save the generated keys pair, default "%(default)s" means print keys to stdout'
    )
    parser.add_argument(
        '-b',
        '--bits',
        type=int,
        default=1024,
        help=
        'the length of the prime to be generated in bits (default: %(default)s)'
    )
    parser.add_argument(
        '--cipher',
        help=
        'name of symmetric key algorithm and mode to encrypt the private key, such as aes_128_cbc'
    )
    parser.add_argument(
        '--passphrase',
        help=
        'a password used to protect the private key when using `cipher`. If not given, '
        'you might be asked to enter password during generation process.')

    unicode_args = map(lambda s: unicode(s, sys.getfilesystemencoding()),
                       sys.argv)
    args = parser.parse_args(unicode_args[1:])

    dsa_key = DsaKey(args.bits)
    with utils.smart_open(args.output, mode='wb') as f:
        f.write(
            dsa_key.get_private_key(cipher=args.cipher,
                                    pass_phrase=args.passphrase))
        f.write(dsa_key.get_public_key())
Example #37
 def __iter__(self):
     try:
         self.source.seek(0)
         for line in self.source:
             k = utils.to_unicode(line.rstrip()).split("\t")
             categories = k[3].split(" ")
             for cat in categories:
                 if "/" in cat:
                     continue
                 yield k[4:], k[1], cat
     except AttributeError:
         with utils.smart_open(self.source) as fin:
             for line in fin:
                 k = utils.to_unicode(line.rstrip()).split("\t")
                 categories = k[3].split(" ")
                 for cat in categories:
                     if "/" in cat:
                         continue
                     yield k[4:], k[1], cat
Example #38
    def save_cat2vec_format(self, fname):
        """
        Store cat vectors

        """
        logger.info("storing %sx%s projection weights into %s" %
                    (self.cat_len, self.layer1_size, fname))
        assert (self.cat_len, self.layer1_size) == self.cats.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(
                utils.to_utf8("#cats_len: %d\n#size:%d\n" % self.cats.shape))
            fout.write(
                utils.to_utf8(
                    "#sg:%d\n#hs:%d\n#negative:%d\n#cbow_mean:%d\n" %
                    (self.sg, self.hs, self.negative, self.cbow_mean)))
            for cat_id in self.cat_no_hash.keys():
                row = self.cats[self.cat_no_hash[cat_id]]
                fout.write(
                    utils.to_utf8("%s\t%s\n" %
                                  (cat_id, ' '.join("%f" % val
                                                    for val in row))))
Example #39
    def save_as_text(self, fname, sort_by_word=True):
        """
        Save this Dictionary to a text file, in format:
        `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word,
        or by decreasing word frequency.

        Note: text format should be used for corpus inspection. Use `save`/`load`
        to store in binary format (pickle) for improved performance.
        """
        logger.info("saving dictionary mapping to %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            if sort_by_word:
                for token, tokenid in sorted(iteritems(self.token2id)):
                    line = "%i\t%s\t%i\n" % (tokenid, token,
                                             self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(iteritems(self.dfs),
                                            key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))
Example #40
 def load_from_text(fname):
     """
     Load a previously stored Dictionary from a text file.
     Mirror function to `save_as_text`.
     """
     result = Dictionary()
     with utils.smart_open(fname) as f:
         for lineno, line in enumerate(f):
             line = utils.to_unicode(line)
             try:
                 wordid, word, docfreq = line[:-1].split('\t')
             except Exception:
                 raise ValueError("invalid line in dictionary file %s: %s" %
                                  (fname, line.strip()))
             wordid = int(wordid)
             if word in result.token2id:
                 raise KeyError(
                     'token %s is defined as ID %d and as ID %d' %
                     (word, wordid, result.token2id[word]))
             result.token2id[word] = wordid
             result.dfs[wordid] = int(docfreq)
     return result
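The two methods above round-trip the id<TAB>word<TAB>docfreq layout. Parsing one such line in isolation (made-up values) shows exactly the fields load_from_text feeds into token2id and dfs:

line = '42\tapple\t17\n'
wordid, word, docfreq = line[:-1].split('\t')
print(int(wordid), word, int(docfreq))  # 42 apple 17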
Example #41
def main():
    utils.unicodefy_std_io()

    parser = argparse.ArgumentParser(
        description='Generates an atlassian license')
    parser.add_argument(
        '--version',
        action='version',
        version='Atlassian License Generator {}'.format(__version__))

    parser.add_argument(
        'template',
        help=
        'path to a license template yaml file, such as `templates/jira.yml`')
    parser.add_argument('organisation',
                        help='your company name used to register the product')
    parser.add_argument(
        'server_id',
        nargs='?',
        help='server id, usually in format of `ABCD-1234-EFGH-5678`')

    parser.add_argument(
        '-o',
        '--output',
        default=utils.STD_IO_MARK,
        help='where to save the generated license, default "%(default)s" means '
        'stdout')
    parser.add_argument(
        '--show-raw',
        action='store_true',
        help='also prints raw (not encoded) license content to stderr')
    parser.add_argument(
        '-k',
        '--key',
        '--private-key',
        default='calfzhou.pem',
        help='a key file (contains at least a private DSA key) used to sign the '
        'license (default: %(default)s)')
    parser.add_argument(
        '--passphrase',
        help=
        'password used by the private key. If not given, you might be asked '
        'to enter it when needed.')

    def parse_variable_definition(text):
        parts = text.split('=', 1)
        if len(parts) < 2:
            raise argparse.ArgumentTypeError(
                'unrecognized variable definition "{}"'.format(text))
        return tuple(parts)

    group = parser.add_argument_group(
        'customizing license arguments',
        'use these arguments to over-write default license template '
        'or variables')

    group.add_argument(
        '-v',
        '--var',
        action='append',
        type=parse_variable_definition,
        help='custom variable used by template, e.g. -v number_of_users=200')

    unicode_args = map(lambda s: unicode(s, sys.getfilesystemencoding()),
                       sys.argv)
    args = parser.parse_args(unicode_args[1:])

    custom_variables = None
    if args.var:
        custom_variables = {key: value for key, value in args.var}

    generator = AtlanssianLicenseGenerator(args.template)
    atlassian_license = generator.generate(args.organisation, args.server_id,
                                           custom_variables)

    if args.show_raw:
        print(atlassian_license, file=sys.stderr)

    encoder = AtlassianLicenseEncoder(args.key, args.passphrase)
    encoded_license = encoder.encode(atlassian_license)

    with utils.smart_open(args.output, mode='wb') as f:
        f.write(encoded_license)
Example #42
 def __init__(self, fname):
     self.fname = fname
     if fname.endswith(".gz") or fname.endswith('.bz2'):
         raise NotImplementedError("compressed output not supported with MmWriter")
     self.fout = utils.smart_open(self.fname, 'wb+') # open for both reading and writing
     self.headers_written = False
Example #43
    def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True):
        """
        Load the input-hidden weight matrix from the original C word2vec-tool format.

        Note that the information stored in the file is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.

        `binary` is a boolean indicating whether the data is in binary word2vec format.
        `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
        Word counts are read from `fvocab` filename, if set (this is the file generated
        by `-save-vocab` flag of the original C tool).
        """
        counts = None
        if fvocab is not None:
            logger.info("loading word counts from %s" % (fvocab))
            counts = {}
            with utils.smart_open(fvocab) as fin:
                for line in fin:
                    word, count = utils.to_unicode(line).strip().split()
                    counts[word] = int(count)

        logger.info("loading projection weights from %s" % (fname))
        with utils.smart_open(fname) as fin:
            header = utils.to_unicode(fin.readline())
            vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
            result = Word2Vec(size=layer1_size)
            result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)
            if binary:
                binary_len = dtype(REAL).itemsize * layer1_size
                for line_no in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == b' ':
                            break
                        if ch != b'\n':  # ignore newlines in front of words (some binary files have newline, some don't)
                            word.append(ch)
                    word = utils.to_unicode(b''.join(word))
                    if counts is None:
                        result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                    elif word in counts:
                        result.vocab[word] = Vocab(index=line_no, count=counts[word])
                    else:
                        logger.warning("vocabulary file is incomplete")
                        result.vocab[word] = Vocab(index=line_no, count=None)
                    result.index2word.append(word)
                    result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
            else:
                for line_no, line in enumerate(fin):
                    parts = utils.to_unicode(line).split()
                    if len(parts) != layer1_size + 1:
                        raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                    word, weights = parts[0], map(REAL, parts[1:])
                    if counts is None:
                        result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                    elif word in counts:
                        result.vocab[word] = Vocab(index=line_no, count=counts[word])
                    else:
                        logger.warning("vocabulary file is incomplete")
                        result.vocab[word] = Vocab(index=line_no, count=None)
                    result.index2word.append(word)
                    result.syn0[line_no] = weights
        logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
        result.init_sims(norm_only)
        return result
Example #44
    def accuracy(self, questions, restrict_vocab=30000):
        """
        Compute accuracy of the model. `questions` is a filename where lines are
        4-tuples of words, split into sections by ": SECTION NAME" lines.
        See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

        The accuracy is reported (=printed to log and returned as a list) for each
        section separately, plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all questions containing a word whose frequency
        is not in the top-N most frequent words (default top 30,000).

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        """
        ok_vocab = dict(sorted(iteritems(self.vocab),
                               key=lambda item: -item[1].count)[:restrict_vocab])
        ok_index = set(v.index for v in itervalues(ok_vocab))

        def log_accuracy(section):
            correct, incorrect = section['correct'], section['incorrect']
            if correct + incorrect > 0:
                logger.info("%s: %.1f%% (%i/%i)" %
                    (section['section'], 100.0 * correct / (correct + incorrect),
                    correct, correct + incorrect))

        sections, section = [], None
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    log_accuracy(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': 0, 'incorrect': 0}
            else:
                if not section:
                    raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
                try:
                    a, b, c, expected = [word.lower() for word in line.split()]  # TODO assumes vocabulary preprocessing uses lowercase, too...
                except:
                    logger.info("skipping invalid line #%i in %s" % (line_no, questions))
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    logger.debug("skipping line #%i with OOV words: %s" % (line_no, line))
                    continue

                ignore = set(self.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
                predicted = None
                # find the most likely prediction, ignoring OOV words and input words
                for index in argsort(self.most_similar(positive=[b, c], negative=[a], topn=False))[::-1]:
                    if index in ok_index and index not in ignore:
                        predicted = self.index2word[index]
                        if predicted != expected:
                            logger.debug("%s: expected %s, predicted %s" % (line.strip(), expected, predicted))
                        break
                section['correct' if predicted == expected else 'incorrect'] += 1
        if section:
            # store the last section, too
            sections.append(section)
            log_accuracy(section)

        total = {'section': 'total', 'correct': sum(s['correct'] for s in sections), 'incorrect': sum(s['incorrect'] for s in sections)}
        log_accuracy(total)
        sections.append(total)
        return sections
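A hypothetical analogy file in the format the docstring describes (": SECTION" headers followed by four-word lines), suitable for passing to the accuracy() method above on a trained model:

questions = '\n'.join([
    ': capital-common-countries',
    'athens greece berlin germany',
    'paris france rome italy',
]) + '\n'
with open('questions-demo.txt', 'w') as f:
    f.write(questions)
# model.accuracy('questions-demo.txt')  # `model` assumed to expose the method above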
Example #45
    def load_word2vec_format(
            cls,
            fname,
            fvocab=None,
            binary=False,
            encoding='utf8',
            unicode_errors='strict',
            limit=None,
            datatype=REAL):
        """
        Load the input-hidden weight matrix from the original C word2vec-tool format.
        Note that the information stored in the file is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.
        `binary` is a boolean indicating whether the data is in binary word2vec format.
        `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
        Word counts are read from `fvocab` filename, if set (this is the file generated
        by `-save-vocab` flag of the original C tool).
        If you trained the C model using non-utf8 encoding for words, specify that
        encoding in `encoding`.
        `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
        file may include word tokens truncated in the middle of a multibyte unicode character
        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
        `limit` sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.
        `datatype` (experimental) can coerce dimensions to a non-default float type (such
        as np.float16) to save memory. (Such types may result in much slower bulk operations
        or incompatibility with optimized routines.)
        """
        counts = None
        if fvocab is not None:
            print("loading word counts from %s" % fvocab)
            counts = {}
            with utils.smart_open(fvocab) as fin:
                for line in fin:
                    word, count = utils.to_unicode(line).strip().split()
                    counts[word] = int(count)

        print("loading projection weights from %s" % fname)
        with utils.smart_open(fname) as fin:
            header = utils.to_unicode(fin.readline(), encoding=encoding)
            # throws for invalid file format
            vocab_size, vector_size = (int(x) for x in header.split())
            if limit:
                vocab_size = min(vocab_size, limit)
            result = cls()
            result.vector_size = vector_size
            result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

            def add_word(word, weights):
                word_id = len(result.vocab)
                # print("word id: %d, word: %s, weights: %s" % (word_id, word, weights))
                if word in result.vocab:
                    print(
                        "duplicate word '%s' in %s, ignoring all but first" %
                        (word, fname))
                    return
                if counts is None:
                    # most common scenario: no vocab file given. just make up
                    # some bogus counts, in descending order
                    result.vocab[word] = Vocab(
                        index=word_id, count=vocab_size - word_id)
                elif word in counts:
                    # use count from the vocab file
                    result.vocab[word] = Vocab(
                        index=word_id, count=counts[word])
                else:
                    # vocab file given, but word is missing -- set count to
                    # None (TODO: or raise?)
                    print(
                        "vocabulary file is incomplete: '%s' is missing" %
                        word)
                    result.vocab[word] = Vocab(index=word_id, count=None)
                result.syn0[word_id] = weights
                result.index2word.append(word)

            if binary:
                binary_len = dtype(REAL).itemsize * vector_size
                for _ in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == b' ':
                            break
                        if ch == b'':
                            raise EOFError(
                                "unexpected end of input; is count incorrect or file otherwise damaged?")
                        # ignore newlines in front of words (some binary files
                        # have)
                        if ch != b'\n':
                            word.append(ch)
                    word = utils.to_unicode(
                        b''.join(word), encoding=encoding, errors=unicode_errors)
                    weights = fromstring(fin.read(binary_len), dtype=REAL)
                    add_word(word, weights)
            else:
                for line_no in xrange(vocab_size):
                    line = fin.readline()
                    if line == b'':
                        raise EOFError(
                            "unexpected end of input; is count incorrect or file otherwise damaged?")
                    parts = utils.to_unicode(
                        line.rstrip(),
                        encoding=encoding,
                        errors=unicode_errors).split(" ")
                    if len(parts) != vector_size + 1:
                        raise ValueError(
                            "invalid vector on line %s (is this really the text format?)" %
                            line_no)
                    word, weights = parts[0], [REAL(x) for x in parts[1:]]
                    add_word(word, weights)
        if result.syn0.shape[0] != len(result.vocab):
            print(
                "duplicate words detected, shrinking matrix size from %i to %i" %
                (result.syn0.shape[0], len(result.vocab)))
            result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)])
        assert (len(result.vocab), vector_size) == result.syn0.shape

        print("loaded %s matrix from %s" % (result.syn0.shape, fname))
        return result
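The binary layout this loader expects can be produced with a few lines of numpy: an ASCII "vocab_size vector_size" header, then for each word its token, a single space, and vector_size raw float32 values. A minimal sketch with made-up vectors:

import numpy as np

vectors = {
    'king': np.array([0.1, 0.2, 0.3], dtype=np.float32),
    'queen': np.array([0.4, 0.5, 0.6], dtype=np.float32),
}
with open('tiny.bin', 'wb') as fout:
    fout.write(('%d %d\n' % (len(vectors), 3)).encode('utf8'))
    for word, vec in vectors.items():
        fout.write(word.encode('utf8') + b' ' + vec.tobytes() + b'\n')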
Example #46
    def load_word2vec_format(cls,
                             fname,
                             fvocab=None,
                             binary=False,
                             encoding='utf8',
                             unicode_errors='strict',
                             limit=None,
                             datatype=REAL):
        """
        Load the input-hidden weight matrix from the original C word2vec-tool format.
        Note that the information stored in the file is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.
        `binary` is a boolean indicating whether the data is in binary word2vec format.
        `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
        Word counts are read from `fvocab` filename, if set (this is the file generated
        by `-save-vocab` flag of the original C tool).
        If you trained the C model using non-utf8 encoding for words, specify that
        encoding in `encoding`.
        `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
        file may include word tokens truncated in the middle of a multibyte unicode character
        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
        `limit` sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.
        `datatype` (experimental) can coerce dimensions to a non-default float type (such
        as np.float16) to save memory. (Such types may result in much slower bulk operations
        or incompatibility with optimized routines.)
        """
        counts = None
        if fvocab is not None:
            logging.debug("loading word counts from %s" % fvocab)
            counts = {}
            with utils.smart_open(fvocab) as fin:
                for line in fin:
                    word, count = utils.to_unicode(line).strip().split()
                    counts[word] = int(count)

        logging.debug("loading projection weights from %s" % fname)
        with utils.smart_open(fname) as fin:
            header = utils.to_unicode(fin.readline(), encoding=encoding)
            # throws for invalid file format
            vocab_size, vector_size = (int(x) for x in header.split())
            if limit:
                vocab_size = min(vocab_size, limit)
            result = cls()
            result.vector_size = vector_size
            result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

            def add_word(word, weights):
                word_id = len(result.vocab)
                # logging.debug("word id: %d, word: %s, weights: %s" % (word_id, word, weights))
                if word in result.vocab:
                    logging.debug(
                        "duplicate word '%s' in %s, ignoring all but first" %
                        (word, fname))
                    return
                if counts is None:
                    # most common scenario: no vocab file given. just make up
                    # some bogus counts, in descending order
                    result.vocab[word] = Vocab(index=word_id,
                                               count=vocab_size - word_id)
                elif word in counts:
                    # use count from the vocab file
                    result.vocab[word] = Vocab(index=word_id,
                                               count=counts[word])
                else:
                    # vocab file given, but word is missing -- set count to
                    # None (TODO: or raise?)
                    logging.debug(
                        "vocabulary file is incomplete: '%s' is missing" %
                        word)
                    result.vocab[word] = Vocab(index=word_id, count=None)
                result.syn0[word_id] = weights
                result.index2word.append(word)

            if binary:
                binary_len = dtype(REAL).itemsize * vector_size
                for _ in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == b' ':
                            break
                        if ch == b'':
                            raise EOFError(
                                "unexpected end of input; is count incorrect or file otherwise damaged?"
                            )
                        # ignore newlines in front of words (some binary files
                        # have)
                        if ch != b'\n':
                            word.append(ch)
                    word = utils.to_unicode(b''.join(word),
                                            encoding=encoding,
                                            errors=unicode_errors)
                    weights = fromstring(fin.read(binary_len), dtype=REAL)
                    add_word(word, weights)
            else:
                for line_no in xrange(vocab_size):
                    line = fin.readline()
                    if line == b'':
                        raise EOFError(
                            "unexpected end of input; is count incorrect or file otherwise damaged?"
                        )
                    parts = utils.to_unicode(line.rstrip(),
                                             encoding=encoding,
                                             errors=unicode_errors).split(" ")
                    if len(parts) != vector_size + 1:
                        raise ValueError(
                            "invalid vector on line %s (is this really the text format?)"
                            % line_no)
                    word, weights = parts[0], [REAL(x) for x in parts[1:]]
                    add_word(word, weights)
        if result.syn0.shape[0] != len(result.vocab):
            logging.debug(
                "duplicate words detected, shrinking matrix size from %i to %i"
                % (result.syn0.shape[0], len(result.vocab)))
            result.syn0 = ascontiguousarray(result.syn0[:len(result.vocab)])
        assert (len(result.vocab), vector_size) == result.syn0.shape
        '''
        KDTree
        Build KDTree with vectors.
        http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree
        '''
        result.kdt = KDTree(result.syn0, leaf_size=10, metric="euclidean")
        logging.debug("loaded %s matrix from %s" % (result.syn0.shape, fname))
        return result
Example #47
    def get_tokens(self, words):
        for i in range(1, len(self.line)):
            if not words.get(self.line[i - 1]):
                words[self.line[i - 1]] = {self.line[i]: 1}
            elif not words[self.line[i - 1]].get(self.line[i]):
                words[self.line[i - 1]][self.line[i]] = 1
            else:
                words[self.line[i - 1]][self.line[i]] += 1
        return words


parser = argparse.ArgumentParser(
    description='A script which collects words from file')
parser.add_argument('--input-dir',
                    dest='directory',
                    type=str,
                    default='stdin',
                    help='File directory')
parser.add_argument('--model', required=True, type=str, help='Save file')
parser.add_argument('--lc', action='store_true', help='Switch to lowercase')
args = parser.parse_args()

if __name__ == '__main__':
    words = {}
    with utils.smart_open(args.directory, "r") as fin:
        for line in utils.all_files_generator(fin):
            p = Parser(line)
            p.preprocess(args.lc)
            p.get_tokens(words)
    utils.dump_dictionary(args.model, words)
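get_tokens builds nested bigram counts (words[previous][next] -> count). A hypothetical follow-up that turns those counts into conditional frequencies:

def bigram_probabilities(words):
    # words: {previous: {next: count}} as produced by get_tokens()
    probs = {}
    for prev, nexts in words.items():
        total = float(sum(nexts.values()))
        probs[prev] = {nxt: cnt / total for nxt, cnt in nexts.items()}
    return probs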