def save_word2vec_format(self, fname, fvocab=None, binary=False):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.
    """
    if fvocab is not None:
        logger.info("Storing vocabulary in %s" % (fvocab))
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
    logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), self.layer1_size, fname))
    assert (len(self.vocab), self.layer1_size) == self.syn0.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % self.syn0.shape))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
            row = self.syn0[vocab.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
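# Usage sketch (not from the original source; `model` and the file names are assumptions):
# the method above writes one "<word> <floats...>" line per word in text mode, or the packed
# C word2vec layout with binary=True. Because it goes through utils.smart_open, a .gz path
# should be written compressed transparently.
model.save_word2vec_format('vectors.txt')                          # plain text format
model.save_word2vec_format('vectors.bin', binary=True)             # binary word2vec format
model.save_word2vec_format('vectors.txt.gz', fvocab='vocab.txt')   # compressed output plus vocab counts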
def main():
    utils.unicodefy_std_io()

    parser = argparse.ArgumentParser(description='Decodes (and verifies) an atlassian license')
    parser.add_argument('--version', action='version',
                        version='Atlassian License Decoder {}'.format(__version__))
    parser.add_argument('-k', '--key', '--public-key', default='atlassian.pem',
                        help='a key file (contains at least a public DSA key) used to verify license '
                             '(default: %(default)s)')
    parser.add_argument('-V', '--no-verify', dest='verify', action='store_false',
                        help='skip license verification step')
    parser.add_argument('-i', '--input', default=utils.STD_IO_MARK,
                        help='from where to read license, default "%(default)s" means stdin')
    parser.add_argument('-o', '--output', default=utils.STD_IO_MARK,
                        help='where to save the decoded license, default "%(default)s" means stdout')

    unicode_args = map(lambda s: unicode(s, sys.getfilesystemencoding()), sys.argv)
    args = parser.parse_args(unicode_args[1:])

    with utils.smart_open(args.input, mode='rb') as f:
        atlassian_license = f.read()

    decoder = AtlassianLicenseDecoder(args.key)
    decompressed_content, verified = decoder.decode(atlassian_license, need_verify=args.verify)

    with utils.smart_open(args.output, mode='wb') as f:
        f.write(decompressed_content)

    if verified is not None and not verified:
        print('\nWARNING: the license can NOT be verified by the given public key', file=sys.stderr)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', nargs='+', help='Input files')
    parser.add_argument('-o', '--output-file', default='-', help='Output path')
    args = parser.parse_args()

    with utils.smart_open(args.output_file, 'w') as output_file:
        for input_path in args.input_file:
            with utils.smart_open(input_path, 'r') as input_file:
                parser = Parser(input_file)
                stmts = parser.parse()
                code_gen = CodeGenerator('quad')
                code_gen.gen(stmts)
def main():
    params = argparse.ArgumentParser(description='Alignment CLI')
    add_parameters(params)
    args = params.parse_args()

    trg_lengths = [len(x) for x in read_sentences(args.target)]
    src_lengths = [len(x) for x in read_sentences(args.source)]

    alignments, is_multiline = read_alignment_file(path=args.alignment,
                                                   trg_lengths=trg_lengths,
                                                   src_lengths=src_lengths)
    alignments = process_alignments(
        alignments=alignments,
        unaligned_target=args.unaligned_target,
        multiply_aligned_target=args.multiply_aligned_target,
        eps_index=args.unaligned_target_epsilon_index,
        bbn_multiply_aligned_target=args.bbn_multiply_aligned_target,
        bbn_unaligned_target=args.bbn_unaligned_target)

    if args.output_format is None:
        flat_output = not is_multiline
    else:
        flat_output = True if args.output_format == "flat" else False

    output_stream = sys.stdout if args.output is None else smart_open(args.output, mode='wt')
    print_alignments(alignments=alignments,
                     stream=output_stream,
                     print_unaligned_target=True if args.unaligned_target == "keep" else False,
                     eps_index=args.unaligned_target_epsilon_index,
                     flat=flat_output)
def __iter__(self):
    # the entire corpus is one gigantic line -- there are no sentence marks at all
    # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens
    sentence, rest, max_sentence_length = [], b'', 1000
    with utils.smart_open(self.fname) as fin:
        while True:
            text = rest + fin.read(8192)  # avoid loading the entire file (=1 line) into RAM
            if text == rest:  # EOF
                sentence.extend(rest.split())  # return the last chunk of words, too (may be shorter/longer)
                if sentence:
                    yield sentence
                break
            # the last token may have been split in two... keep it for the next iteration
            last_token = text.rfind(b' ')
            words, rest = (utils.to_unicode(text[:last_token]).split(),
                           text[last_token:].strip()) if last_token >= 0 else ([], text)
            sentence.extend(words)
            while len(sentence) >= max_sentence_length:
                yield sentence[:max_sentence_length]
                sentence = sentence[max_sentence_length:]
def docbyoffset(self, offset): """Return document at file offset `offset` (in bytes)""" # empty documents are not stored explicitly in MM format, so the index marks # them with a special offset, -1. if offset == -1: return [] if isinstance(self.input, string_types): fin = utils.smart_open(self.input) else: fin = self.input fin.seek(offset) # works for gzip/bz2 input, too previd, document = -1, [] for line in fin: docid, termid, val = line.split() if not self.transposed: termid, docid = docid, termid docid, termid, val = int(docid) - 1, int(termid) - 1, float( val ) # -1 because matrix market indexes are 1-based => convert to 0-based assert previd <= docid, "matrix columns must come in ascending order" if docid != previd: if previd >= 0: return document previd = docid document.append(( termid, val, )) # add another field to the current document return document
def read_alignment_file(path, trg_lengths, src_lengths):
    """
    read flat alignment file

    :param path: path to alignment file
    :param trg_lengths: array of target lengths (for each sentence)
    :param src_lengths: array of source lengths (for each sentence)
    :return: array of alignments (unprocessed)
    """
    check_condition(len(trg_lengths) == len(src_lengths),
                    "source and target sentences must be parallel")
    file = smart_open(path)
    content = file.readlines()

    if len(content) == len(trg_lengths):
        is_multiline = False
        alignments = _read_flat_alignment_file(content=content, trg_lengths=trg_lengths)
    else:
        is_multiline = True
        alignments = _read_multiline_alignment_file(content=content, trg_lengths=trg_lengths)

    check_condition(len(alignments) == len(trg_lengths), "alignments must be parallel")
    return alignments, is_multiline
def readArk(filename, limit=numpy.inf, memmap_dir='', memmap_dtype='float32'):
    """
    Reads the features in a Kaldi ark file.
    Returns a list of feature matrices and a list of the utterance IDs.
    """
    features = []
    uttids = []
    with smart_open(filename, "rb") as f:
        while True:
            try:
                uttid = readString(f).decode('utf-8')
            except ValueError:
                break
            feature = readMatrix(f)
            # use a memmap dir to hold the array content on disk
            # (e.g. an ssd cache that is larger than your main memory)
            if memmap_dir != '':
                feature_mmap = numpy.memmap(memmap_dir + '/' + uttid,
                                            dtype=memmap_dtype, mode='w+',
                                            shape=feature.shape)
                feature_mmap[:] = feature[:]
                feature_mmap.flush()
                features.append(feature_mmap)
                del feature
            else:
                features.append(feature)
            uttids.append(uttid)
            if len(features) == limit:
                break
    if memmap_dir != '':
        with io.open(memmap_dir + '/' + 'feature_map', 'w') as feature_map:
            for uttid, feature in zip(uttids, features):
                feature_map.write(uttid + " %i %i\n" % (feature.shape[0], feature.shape[1]))
    return features, uttids
def __init__(self, fname): self.fname = fname if fname.endswith(".gz") or fname.endswith('.bz2'): raise NotImplementedError( "compressed output not supported with MmWriter") self.fout = utils.smart_open( self.fname, 'wb+') # open for both reading and writing self.headers_written = False
def writeScp(filename, uttids, pointers, append=False):
    """
    Takes a list of utterance IDs and a list of strings in the format "filename:offset",
    and writes them to a Kaldi script file.
    """
    with smart_open(filename, "a" if append else "w") as f:
        for uttid, pointer in zip(uttids, pointers):
            f.write("%s %s\n" % (uttid, pointer))
def load_grammar(path, grammarfmt, transform):
    """
    Load a WCFG from a file.

    :args path: path to the grammar (or prefix path to rules and lexicon)
    :args grammarfmt: 'bar', 'discodop' or 'milos' (which looks like 'bar'
        but with terminals surrounded by quotes)
    :returns: WCFG
    """
    if grammarfmt == 'bar':
        istream = smart_open(path)
        grammar = wcfg.WCFG(wcfg.read_grammar_rules(istream, transform))
    elif grammarfmt == 'milos':
        istream = smart_open(path)
        grammar = wcfg.WCFG(wcfg.read_grammar_rules(istream, transform, strip_quotes=True))
    elif grammarfmt == 'discodop':
        grammar = discodopfmt.read_grammar('{0}.rules.gz'.format(path),
                                           '{0}.lex.gz'.format(path),
                                           transform)
    else:
        raise NotImplementedError("I don't know this grammar format: %s" % grammarfmt)
    return grammar
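# Hypothetical calls (file names and the transform are assumptions, not from the source):
# 'bar' and 'milos' point at a single rules file, while 'discodop' takes a prefix and expects
# <prefix>.rules.gz and <prefix>.lex.gz to exist. `transform` maps each rule's probability
# (num/den) to whatever weight the caller wants, e.g. math.log for log-probabilities.
from math import log

grammar = load_grammar('grammar.rules', 'bar', transform=log)
grammar = load_grammar('binarized', 'discodop', transform=log)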
def readScp(filename, limit=numpy.inf, memmap_dir='', memmap_dtype='float32'):
    """
    Reads the features in a Kaldi script file.
    Returns a list of feature matrices and a list of the utterance IDs.
    """
    features = []
    uttids = []
    with smart_open(filename, "r") as f:
        for line in f:
            uttid, pointer = line.strip().split()
            p = pointer.rfind(":")
            arkfile, offset = pointer[:p], int(pointer[p + 1:])
            with smart_open(arkfile, "rb") as g:
                g.seek(offset)
                feature = readMatrix(g)
            features.append(feature)
            uttids.append(uttid)
            if len(features) == limit:
                break
    return features, uttids
def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    `fname` is the file used to save the vectors in
    `fvocab` is an optional file used to save the vocabulary
    `binary` is an optional boolean indicating whether the data is to be saved
    in binary word2vec format (default: False)
    `total_vec` is an optional parameter to explicitly specify total no. of vectors
    (in case word vectors are appended with document vectors afterwards)
    """
    if total_vec is None:
        total_vec = len(self.vocab)
    vector_size = self.syn0.shape[1]
    if fvocab is not None:
        logger.info("storing vocabulary in %s" % (fvocab))
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
    logger.info("storing %sx%s projection weights into %s" % (total_vec, vector_size, fname))
    assert (len(self.vocab), vector_size) == self.syn0.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
            row = self.syn0[vocab.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
def __iter__(self): """Iterate through the lines in the source.""" try: # Assume it is a file-like object and try treating it as such # Things that don't have seek will trigger an exception self.source.seek(0) for item_no, line in enumerate(self.source): yield TaggedDocument(utils.to_unicode(line).split(), [item_no]) except AttributeError: # If it didn't work like a file, use it as a string filename with utils.smart_open(self.source) as fin: for item_no, line in enumerate(fin): yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
def __iter__(self): """Iterate through the lines in the source.""" try: # Assume it is a file-like object and try treating it as such # Things that don't have seek will trigger an exception self.source.seek(0) for line in self.source: yield utils.to_unicode(line).split() except AttributeError: # If it didn't work like a file, use it as a string filename with utils.smart_open(self.source) as fin: for line in fin: yield utils.to_unicode(line).split()
def save_cat2vec_format(self, fname): """ Store cat vectors """ logger.info("storing %sx%s projection weights into %s" % (self.cat_len, self.layer1_size, fname)) assert (self.cat_len, self.layer1_size) == self.cats.shape with utils.smart_open(fname, 'wb') as fout: fout.write(utils.to_utf8("#cats_len: %d\n#size:%d\n" % self.cats.shape)) fout.write(utils.to_utf8("#sg:%d\n#hs:%d\n#negative:%d\n#cbow_mean:%d\n" % (self.sg,self.hs,self.negative,self.cbow_mean))) for cat_id in self.cat_no_hash.keys(): row = self.cats[self.cat_no_hash[cat_id]] fout.write(utils.to_utf8("%s\t%s\n" % (cat_id, ' '.join("%f" % val for val in row))))
def output(html, out_filename):
    soup = BeautifulSoup(html, 'lxml')
    with utils.smart_open(out_filename, 'w', encoding='utf-8') as f:
        for p in soup.find_all('p'):
            bold = p.find('font', attrs={'style': 'FONT-WEIGHT: bold; FONT-SIZE: 13px;'})
            if bold and bold.text.strip().endswith(':'):
                bold.decompose()
            if p.find('img'):
                continue
            text = p.text
            if text:
                print(text, file=f)
def __iter__(self): """Iterate through the lines in the source.""" try: # Assume it is a file-like object and try treating it as such # Things that don't have seek will trigger an exception self.source.seek(0) for line in self.source: yield line.split() except AttributeError: # If it didn't work like a file, use it as a string filename with utils.smart_open(self.source) as fin: for line in fin: yield line.split()
def iterrules(path, transform):
    fi = smart_open(path)
    for line in fi:
        line = line.strip()
        if not line:
            continue
        fields = line.split()
        lhs = fields[0]
        num, den = fields[-1].split('/')
        num = float(num)
        den = float(den)
        rhs = fields[1:-2]  # fields[-2] is the yield function, which we are ignoring
        yield Rule(make_nonterminal(lhs), [make_nonterminal(s) for s in rhs], transform(num / den))
def __iter__(self):
    # try:
    #     self.source.seek(0)
    #     for line in self.source:
    #         yield utils.to_unicode(line).split()
    #
    # except AttributeError:
    # If it didn't work like a file, use it as a string filename
    with utils.smart_open(self.source) as fin:
        for line in fin:
            t3 = self.parse(line)
            if t3 != None:
                yield t3
def read_sentences(path): """ read file line by line and split words :param path: file to read :return: array of lines """ file = smart_open(path) sentences = [] for line in file.readlines(): tokens = line.strip().split(" ") tokens = list(filter(bool, tokens)) sentences.append(tokens) return sentences
def iterlexicon(path, transform):
    fi = smart_open(path)
    for line in fi:
        line = line.strip()
        if not line:
            continue
        fields = line.split('\t')
        word = fields[0]
        for pair in fields[1:]:
            tag, fraction = pair.split(' ')
            num, den = fraction.split('/')
            num = float(num)
            den = float(den)
            yield Rule(make_nonterminal(tag), [make_terminal(word)], transform(num / den))
def __iter__(self):
    if not self.split:
        try:
            self.source.seek(0)
            for line in self.source:
                k = utils.to_unicode(line.rstrip()).split("\t")
                yield k[self.cont_col:], k[self.sent_col], k[self.cat_col]
        except AttributeError:
            with utils.smart_open(self.source) as fin:
                for line in fin:
                    k = utils.to_unicode(line.rstrip()).split("\t")
                    yield k[self.cont_col:], k[self.sent_col], k[self.cat_col]
    else:
        if isinstance(self.source, list):
            split_files = self.source
        else:
            split_files = glob.glob(self.source + ".[a-z][a-z]")
        if self.rand:
            random.shuffle(split_files)
        for source in split_files:
            with utils.smart_open(source) as fin:
                for line in fin:
                    k = utils.to_unicode(line.rstrip()).split("\t")
                    yield k[self.cont_col:], k[self.sent_col], k[self.cat_col]
def writeArk(filename, features, uttids, append=False):
    """
    Takes a list of feature matrices and a list of utterance IDs, and writes
    them to a Kaldi ark file. Returns a list of strings in the format
    "filename:offset", which can be used to write a Kaldi script file.
    """
    pointers = []
    with smart_open(filename, "ab" if append else "wb") as f:
        for feature, uttid in zip(features, uttids):
            writeString(f, uttid.encode('utf-8'))
            pointers.append("%s:%d" % (filename, f.tell()))
            writeMatrix(f, feature)
    return pointers
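# Usage sketch (file names are assumptions): writeArk() returns "ark:offset" pointers that
# writeScp() serializes, and readScp() later follows those pointers back into the ark file,
# so a write/read round trip should reproduce the original feature matrices and utterance IDs.
pointers = writeArk('feats.ark', features, uttids)
writeScp('feats.scp', uttids, pointers)
features_again, uttids_again = readScp('feats.scp')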
def __iter__(self):
    for fname in os.listdir(self.dirname):
        fname = os.path.join(self.dirname, fname)
        if not os.path.isfile(fname):
            continue
        for line in utils.smart_open(fname):
            line = utils.to_unicode(line)
            # each file line is a single sentence in the Brown corpus
            # each token is WORD/POS_TAG
            token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
            # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
            words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
            if not words:  # don't bother sending out empty sentences
                continue
            yield words
def __iter__(self):
    for fname in os.listdir(self.dirname):
        fname = os.path.join(self.dirname, fname)
        if not os.path.isfile(fname):
            continue
        for item_no, line in enumerate(utils.smart_open(fname)):
            line = utils.to_unicode(line)
            # each file line is a single document in the Brown corpus
            # each token is WORD/POS_TAG
            token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
            # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
            words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
            if not words:  # don't bother sending out empty documents
                continue
            yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)])
def __iter__(self):
    try:
        self.source.seek(0)
        for line in self.source:
            k = utils.to_unicode(line.rstrip()).split("\t")
            categories = k[3].split(" ")
            for cat in categories:
                if "/" in cat:
                    continue
                yield k[4:], k[1], cat
    except AttributeError:
        with utils.smart_open(self.source) as fin:
            for line in fin:
                k = utils.to_unicode(line.rstrip()).split("\t")
                categories = k[3].split(" ")
                for cat in categories:
                    if "/" in cat:
                        continue
                    yield k[4:], k[1], cat
def save_doc2vec_format(self, fname):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.
    """
    logger.info("storing %sx%s projection weights into %s" % (self.sents_len, self.layer1_size, fname))
    assert (self.sents_len, self.layer1_size) == self.sents.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % self.sents.shape))
        # store in sorted order: most frequent words at the top
        for sent_no in xrange(self.sents_len):
            row = self.sents[sent_no]
            fout.write(utils.to_utf8("sent_%d %s\n" % (sent_no, ' '.join("%f" % val for val in row))))
def main():
    utils.unicodefy_std_io()

    parser = argparse.ArgumentParser(description='Generates a pair of DSA keys')
    parser.add_argument('--version', action='version',
                        version='DSA Key Generator {}'.format(__version__))
    parser.add_argument('-o', '--output', default=utils.STD_IO_MARK,
                        help='where to save the generated keys pair, default "%(default)s" means '
                             'print keys to stdout')
    parser.add_argument('-b', '--bits', type=int, default=1024,
                        help='the length of the prime to be generated in bits (default: %(default)s)')
    parser.add_argument('--cipher',
                        help='name of symmetric key algorithm and mode to encrypt the private key, '
                             'such as aes_128_cbc')
    parser.add_argument('--passphrase',
                        help='a password used to protect the private key when using `cipher`. If not given, '
                             'you might be asked to enter password during generation process.')

    unicode_args = map(lambda s: unicode(s, sys.getfilesystemencoding()), sys.argv)
    args = parser.parse_args(unicode_args[1:])

    dsa_key = DsaKey(args.bits)
    with utils.smart_open(args.output, mode='wb') as f:
        f.write(dsa_key.get_private_key(cipher=args.cipher, pass_phrase=args.passphrase))
        f.write(dsa_key.get_public_key())
def save_as_text(self, fname, sort_by_word=True):
    """
    Save this Dictionary to a text file, in format:
    `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`.
    Sorted by word, or by decreasing word frequency.

    Note: the text format should be used for corpus inspection. Use `save`/`load`
    to store in binary format (pickle) for improved performance.
    """
    logger.info("saving dictionary mapping to %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        if sort_by_word:
            for token, tokenid in sorted(iteritems(self.token2id)):
                line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                fout.write(utils.to_utf8(line))
        else:
            for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                fout.write(utils.to_utf8(line))
def load_from_text(fname): """ Load a previously stored Dictionary from a text file. Mirror function to `save_as_text`. """ result = Dictionary() with utils.smart_open(fname) as f: for lineno, line in enumerate(f): line = utils.to_unicode(line) try: wordid, word, docfreq = line[:-1].split('\t') except Exception: raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip())) wordid = int(wordid) if word in result.token2id: raise KeyError( 'token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word])) result.token2id[word] = wordid result.dfs[wordid] = int(docfreq) return result
def main():
    utils.unicodefy_std_io()

    parser = argparse.ArgumentParser(description='Generates an atlassian license')
    parser.add_argument('--version', action='version',
                        version='Atlassian License Generator {}'.format(__version__))
    parser.add_argument('template',
                        help='path to a license template yaml file, such as `templates/jira.yml`')
    parser.add_argument('organisation',
                        help='your company name used to register the product')
    parser.add_argument('server_id', nargs='?',
                        help='server id, usually in format of `ABCD-1234-EFGH-5678`')
    parser.add_argument('-o', '--output', default=utils.STD_IO_MARK,
                        help='where to save the generated license, default "%(default)s" means stdout')
    parser.add_argument('--show-raw', action='store_true',
                        help='also prints raw (not encoded) license content to stderr')
    parser.add_argument('-k', '--key', '--private-key', default='calfzhou.pem',
                        help='a key file (contains at least a private DSA key) used to sign the '
                             'license (default: %(default)s)')
    parser.add_argument('--passphrase',
                        help='password used by the private key. If not given, you might be asked '
                             'to enter it when needed.')

    def parse_variable_definition(text):
        parts = text.split('=', 1)
        if len(parts) < 2:
            raise argparse.ArgumentTypeError('unrecognized variable definition "{}"'.format(text))
        return tuple(parts)

    group = parser.add_argument_group('customizing license arguments',
                                      'use these arguments to over-write default license template '
                                      'or variables')
    group.add_argument('-v', '--var', action='append', type=parse_variable_definition,
                       help='custom variable used by template, e.g. -v number_of_users=200')

    unicode_args = map(lambda s: unicode(s, sys.getfilesystemencoding()), sys.argv)
    args = parser.parse_args(unicode_args[1:])

    custom_variables = None
    if args.var:
        custom_variables = {key: value for key, value in args.var}

    generator = AtlanssianLicenseGenerator(args.template)
    atlassian_license = generator.generate(args.organisation, args.server_id, custom_variables)
    if args.show_raw:
        print(atlassian_license, file=sys.stderr)

    encoder = AtlassianLicenseEncoder(args.key, args.passphrase)
    encoded_license = encoder.encode(atlassian_license)
    with utils.smart_open(args.output, mode='wb') as f:
        f.write(encoded_license)
def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).
    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline())
        vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
        result = Word2Vec(size=layer1_size)
        result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)
        if binary:
            binary_len = dtype(REAL).itemsize * layer1_size
            for line_no in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have newline, some don't)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word))
                if counts is None:
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
        else:
            for line_no, line in enumerate(fin):
                parts = utils.to_unicode(line).split()
                if len(parts) != layer1_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                word, weights = parts[0], map(REAL, parts[1:])
                if counts is None:
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = weights

    logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
    result.init_sims(norm_only)
    return result
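# Usage sketch (file names are assumptions; assumes the classmethod above is bound to the
# Word2Vec class, as in older gensim releases): loading vectors produced by the original C
# tool or by save_word2vec_format() earlier in this listing. The optional vocab file only
# supplies real word counts; norm_only=True keeps just the L2-normalised vectors in memory.
model = Word2Vec.load_word2vec_format('vectors.bin', binary=True)
model = Word2Vec.load_word2vec_format('vectors.txt', fvocab='vocab.txt', binary=False)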
def accuracy(self, questions, restrict_vocab=30000):
    """
    Compute accuracy of the model. `questions` is a filename where lines are
    4-tuples of words, split into sections by ": SECTION NAME" lines.
    See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

    The accuracy is reported (=printed to log and returned as a list) for each
    section separately, plus there's one aggregate summary at the end.

    Use `restrict_vocab` to ignore all questions containing a word whose frequency
    is not in the top-N most frequent words (default top 30,000).

    This method corresponds to the `compute-accuracy` script of the original C word2vec.
    """
    ok_vocab = dict(sorted(iteritems(self.vocab), key=lambda item: -item[1].count)[:restrict_vocab])
    ok_index = set(v.index for v in itervalues(ok_vocab))

    def log_accuracy(section):
        correct, incorrect = section['correct'], section['incorrect']
        if correct + incorrect > 0:
            logger.info("%s: %.1f%% (%i/%i)" %
                        (section['section'], 100.0 * correct / (correct + incorrect),
                         correct, correct + incorrect))

    sections, section = [], None
    for line_no, line in enumerate(utils.smart_open(questions)):
        # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
        line = utils.to_unicode(line)
        if line.startswith(': '):
            # a new section starts => store the old section
            if section:
                sections.append(section)
                log_accuracy(section)
            section = {'section': line.lstrip(': ').strip(), 'correct': 0, 'incorrect': 0}
        else:
            if not section:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
            try:
                a, b, c, expected = [word.lower() for word in line.split()]  # TODO assumes vocabulary preprocessing uses lowercase, too...
            except:
                logger.info("skipping invalid line #%i in %s" % (line_no, questions))
            if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                logger.debug("skipping line #%i with OOV words: %s" % (line_no, line))
                continue

            ignore = set(self.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
            predicted = None
            # find the most likely prediction, ignoring OOV words and input words
            for index in argsort(self.most_similar(positive=[b, c], negative=[a], topn=False))[::-1]:
                if index in ok_index and index not in ignore:
                    predicted = self.index2word[index]
                    if predicted != expected:
                        logger.debug("%s: expected %s, predicted %s" % (line.strip(), expected, predicted))
                    break
            section['correct' if predicted == expected else 'incorrect'] += 1

    if section:
        # store the last section, too
        sections.append(section)
        log_accuracy(section)

    total = {'section': 'total',
             'correct': sum(s['correct'] for s in sections),
             'incorrect': sum(s['incorrect'] for s in sections)}
    log_accuracy(total)
    sections.append(total)
    return sections
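# Usage sketch (assumes `model` is an instance exposing the accuracy() method above):
# questions-words.txt is the analogy file from the original word2vec distribution. Each
# returned section carries its own correct/incorrect counts, with an aggregate 'total'
# entry appended at the end.
sections = model.accuracy('questions-words.txt', restrict_vocab=30000)
total = sections[-1]
print(total['section'], total['correct'], total['incorrect'])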
def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
                         unicode_errors='strict', limit=None, datatype=REAL):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).

    If you trained the C model using non-utf8 encoding for words, specify that
    encoding in `encoding`.

    `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
    argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
    file may include word tokens truncated in the middle of a multibyte unicode character
    (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.

    `limit` sets a maximum number of word-vectors to read from the file. The default,
    None, means read all.

    `datatype` (experimental) can coerce dimensions to a non-default float type (such as
    np.float16) to save memory. (Such types may result in much slower bulk operations or
    incompatibility with optimized routines.)
    """
    counts = None
    if fvocab is not None:
        print("loading word counts from %s" % fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    print("loading projection weights from %s" % fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls()
        result.vector_size = vector_size
        result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            # print("word id: %d, word: %s, weights: %s" % (word_id, word, weights))
            if word in result.vocab:
                print("duplicate word '%s' in %s, ignoring all but first" % (word, fname))
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                print("vocabulary file is incomplete: '%s' is missing" % word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.syn0[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)

    if result.syn0.shape[0] != len(result.vocab):
        print("duplicate words detected, shrinking matrix size from %i to %i"
              % (result.syn0.shape[0], len(result.vocab)))
        result.syn0 = ascontiguousarray(result.syn0[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.syn0.shape

    print("loaded %s matrix from %s" % (result.syn0.shape, fname))
    return result
def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
                         unicode_errors='strict', limit=None, datatype=REAL):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).

    If you trained the C model using non-utf8 encoding for words, specify that
    encoding in `encoding`.

    `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
    argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
    file may include word tokens truncated in the middle of a multibyte unicode character
    (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.

    `limit` sets a maximum number of word-vectors to read from the file. The default,
    None, means read all.

    `datatype` (experimental) can coerce dimensions to a non-default float type (such as
    np.float16) to save memory. (Such types may result in much slower bulk operations or
    incompatibility with optimized routines.)
    """
    counts = None
    if fvocab is not None:
        logging.debug("loading word counts from %s" % fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logging.debug("loading projection weights from %s" % fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls()
        result.vector_size = vector_size
        result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            # logging.debug("word id: %d, word: %s, weights: %s" % (word_id, word, weights))
            if word in result.vocab:
            logging.debug("duplicate word '%s' in %s, ignoring all but first" % (word, fname)) if False else None
            if word in result.vocab:
                logging.debug("duplicate word '%s' in %s, ignoring all but first" % (word, fname))
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logging.debug("vocabulary file is incomplete: '%s' is missing" % word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.syn0[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)

    if result.syn0.shape[0] != len(result.vocab):
        logging.debug("duplicate words detected, shrinking matrix size from %i to %i"
                      % (result.syn0.shape[0], len(result.vocab)))
        result.syn0 = ascontiguousarray(result.syn0[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.syn0.shape

    '''
    KDTree
    Build KDTree with vectors.
    http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree
    '''
    result.kdt = KDTree(result.syn0, leaf_size=10, metric="euclidean")

    logging.debug("loaded %s matrix from %s" % (result.syn0.shape, fname))
    return result
def get_tokens(self, words):
    for i in range(1, len(self.line)):
        if not words.get(self.line[i - 1]):
            words[self.line[i - 1]] = {self.line[i]: 1}
        elif not words[self.line[i - 1]].get(self.line[i]):
            words[self.line[i - 1]][self.line[i]] = 1
        else:
            words[self.line[i - 1]][self.line[i]] += 1
    return words


parser = argparse.ArgumentParser(description='A script which collects words from file')
parser.add_argument('--input-dir', dest='directory', type=str, default='stdin',
                    help='File directory')
parser.add_argument('--model', required=True, type=str, help='Save file')
parser.add_argument('--lc', action='store_true', help='Switch to lowercase')
args = parser.parse_args()

if __name__ == '__main__':
    words = {}
    with utils.smart_open(args.directory, "r") as fin:
        for line in utils.all_files_generator(fin):
            p = Parser(line)
            p.preprocess(args.lc)
            p.get_tokens(words)
    utils.dump_dictionary(args.model, words)