def load_vectors(cls, filename):
        '''Load word vectors from a file.

        Args:
          filename: the name of the file that contains the word vectors.
            Comment lines start with '#'.
            If the first non-comment line contains only two integers, the
            first is assumed to be the vocabulary size and the second the
            word embedding size (the same format as word2vec).

        Return:
          a WordVectors instance
        '''
        at_beginning = True
        with Reader(filename) as f:
            idx = 1  # 0 for OOV

            vectors = [[0]]  # placeholder for OOV
            word2id = {'OOV': 0}

            for line in f:
                if line.startswith('#'):
                    continue

                if at_beginning:
                    at_beginning = False
                    parts = line.strip().split()
                    if len(parts) == 2:
                        embsize = int(parts[1])
                        oov = np.zeros(embsize)
                    else:
                        word = parts[0]
                        vec = np.array([float(v) for v in parts[1:]])
                        embsize = len(vec)
                        oov = np.zeros(embsize)
                        oov += vec
                        vectors.append(vec)
                        word2id[word] = idx
                        idx += 1
                else:
                    parts = line.strip().split()
                    word = parts[0]
                    vec = np.array([float(v) for v in parts[1:]])
                    assert vec.size == embsize
                    oov += vec
                    vectors.append(vec)
                    word2id[word] = idx
                    idx += 1

            # the OOV embedding at index 0 is the mean of all loaded vectors
            oov = oov / (len(vectors) - 1)
            vectors[0] = oov

            word_vectors = WordVectors(embsize)
            word_vectors._vectors = np.array(vectors).T
            word_vectors._word2id = word2id

            return word_vectors
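A minimal usage sketch for the loader above; the file name and its contents are hypothetical, and WordVectors and Reader come from the surrounding project:

# vectors.txt might look like (word2vec text format):
#   # a comment line, skipped
#   2 3
#   cat 0.1 0.2 0.3
#   dog 0.4 0.5 0.6
word_vectors = WordVectors.load_vectors('vectors.txt')
# unknown words map to id 0, whose vector is the mean of all loaded vectors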
Example #2
def prepare_data(word_vectors=None, datafile=None):
    '''Prepare training data.

    Args:
      word_vectors: an instance of vec.wordvector
      datafile: location of the data file

    Return:
      instances: a list of Instance
      word_vectors: the word vectors
      total_internal_node: total number of internal nodes
    '''
    # comm, rank and worker_num are module-level mpi4py globals
    # (MPI.COMM_WORLD, its rank and its size); rank 0 acts as the master
    if rank == 0:
        # broadcast word vectors
        comm.bcast(word_vectors, root=0)

        # load raw data
        with Reader(datafile) as reader:
            instance_strs = [line for line in reader]

        # send training data
        instance_num = len(instance_strs)
        # split instances into worker_num chunks;
        # the last worker takes whatever remains after equal-size chunks
        esize = int(instance_num / worker_num + 0.5)
        sizes = [esize] * worker_num
        sizes[-1] = instance_num - esize * (worker_num - 1)
        offset = sizes[0]
        for i in range(1, worker_num):
            comm.send(instance_strs[offset:offset + sizes[i]], dest=i)
            offset += sizes[i]
        comm.barrier()

        local_instance_strs = instance_strs[0:sizes[0]]
        del instance_strs

        instances, internal_node_num = load_instances(local_instance_strs,
                                                      word_vectors)
        total_internal_node = comm.allreduce(internal_node_num, op=MPI.SUM)
        return instances, word_vectors, total_internal_node
    else:
        # receive the word vectors broadcast by rank 0
        word_vectors = comm.bcast(root=0)

        # receive data
        local_instance_strs = comm.recv(source=0)
        comm.barrier()

        instances, internal_node_num = load_instances(local_instance_strs,
                                                      word_vectors)
        total_internal_node = comm.allreduce(internal_node_num, op=MPI.SUM)
        return instances, word_vectors, total_internal_node
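A worked example of the chunk sizes computed above, under Python 2 integer division:

# instance_num = 10, worker_num = 3
# esize = int(10 / 3 + 0.5) = int(3.5) = 3
# sizes = [3, 3, 10 - 3*2] = [3, 3, 4]
# rank 0 keeps instances [0:3]; workers 1 and 2 receive [3:6] and [6:10]
Example #3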
def prepare_data(word_vectors=None, datafile=None):
    '''Prepare training data (single-process variant, without MPI).

    Args:
        word_vectors: an instance of vec.wordvector
        datafile: location of the data file

    Return:
        instances: a list of Instance
        word_vectors: word_vectors
        internal_node_num: total number of internal nodes
    '''

    # load raw data
    with Reader(datafile) as reader:
        instance_strs = [line for line in reader]

    instances, internal_node_num = load_instances(instance_strs, word_vectors)
    return instances, word_vectors, internal_node_num
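A minimal end-to-end sketch of this variant; the file names are hypothetical:

word_vectors = WordVectors.load_vectors('vectors.txt')
instances, word_vectors, internal_node_num = prepare_data(word_vectors, 'train.txt')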
Example #4
    def __init__(self, filename):
        # rewrite "name value" lines as "name=value" so ConfigParser accepts them
        with Reader(filename) as reader:
            config_lines = [
                re.sub(ur'^([^ \[]*) ', ur'\1=', line) for line in reader
            ]
        config_str = u''.join(config_lines)

        config = ConfigParser.ConfigParser()
        config.readfp(io.StringIO(config_str))

        # decide feature orders
        order = {
            'lex-sgt': 0,
            'lex-tgs': 1,
            'trans-sgt': 2,
            'trans-tgs': 3,
            'word-count': 4,
            'rule-count': 5,
            'glue-rule-count': 6,
            'lm': 7,
        }

        # set other feature order
        #if config.has_option('switch', 'use-nn-feature'):
        #  self.use_neural_feature = config.getboolean('switch', 'use-nn-feature')
        #else:
        #  self.use_neural_feature = False
        #if self.use_neural_feature:
        #  order['nn-feature'] = len(order)
        #  if not config.has_section(self.NN_SECTION):
        #    raise NoSectionError('section "%s" is absent' % self.NN_SECTION)

        order['oov'] = len(order)  # it should always be the last feature

        # load weights
        all_feature_names = set(order.keys())
        weights = [0] * len(order)
        for key, value in config.items('weights'):
            # DO NOT load useless weights
            #if key == 'nn-feature' and not self.use_neural_feature:
            #  continue
            if key == 'base':  # skip [DEFAULT]
                continue
            weights[order[key]] = float(value)
            all_feature_names.remove(key)

        # the OOV weight is hard-coded; 'oov' must not appear in the config file
        weights[order['oov']] = -100
        all_feature_names.remove('oov')
        if len(all_feature_names) != 0:
            msg = 'weight(s) absent for feature(s): %s' % all_feature_names
            logger.error(msg)
            raise AbsentWeightError(msg)

        self.order = order
        self.weights = weights
        logger.info('weights : %s' % self.weights)

        # load common parameters
        self.x_beta = config.getfloat('param', 'X-beta')
        self.x_beamsize = config.getint('param', 'X-beamsize')

        self.s_beta = config.getfloat('param', 'S-beta')
        self.s_beamsize = config.getint('param', 'S-beamsize')

        self.rule_beamsize = config.getint('param', 'rule-beamsize')

        self.max_X_len = config.getint('param', 'max-X-len')

        self.epsilon = config.getint('param', 'epsilon')

        # load rule table
        self.rule_table_file = config.get('data', 'rules')

        # load language model data
        self.lm_file = config.get('data', 'lm-file')
        self.lm_order = config.getint('data', 'lm-order')

        self.enable_type3_glue_rule = False

        self.raw_config = config
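Given the "name value" to "name=value" rewrite at the top of __init__, a config file in the expected shape might look like the sketch below. All values are illustrative; 'oov' is omitted from [weights] because its weight is hard-coded to -100 above, and a 'base' key (inherited from [DEFAULT]) would be skipped.

[weights]
lex-sgt 0.1
lex-tgs 0.1
trans-sgt 0.2
trans-tgs 0.2
word-count -0.3
rule-count 0.1
glue-rule-count 0.05
lm 0.5

[param]
X-beta 0.5
X-beamsize 30
S-beta 0.5
S-beamsize 30
rule-beamsize 100
max-X-len 10
epsilon 1

[data]
rules /path/to/rule.table
lm-file /path/to/lm.arpa
lm-order 3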
Example #5
    def __load_rules(cls, filename, lm, config):
        '''Load the rule table from filename.

        Args:
          filename: the name of the file that stores the rules
          lm: language model
          config: an instance of Config

        Return:
          a RuleTable
        '''

        feature_num = config.get_feature_num()
        glue_rule_index = config.order['glue-rule-count']
        max_rule_num = config.rule_beamsize

        table = RuleTable()
        keys = []
        ranges = []

        idx = 0
        # glue rules
        # S -> X
        # type 1 glue rule is not counted
        features = [0] * feature_num
        glue_rule1 = Rule('|0', ['|0'], [0], features,
                          cls.GLUE_RULE1_GLOBAL_ID)
        table._rules.append(glue_rule1)
        keys.append(cls.GLUE_RULE1.decode('utf-8'))
        ranges.append((idx, idx))
        idx += 1

        # S -> S X
        features = [0] * feature_num
        features[glue_rule_index] = 1
        glue_rule2 = Rule('|0 |1', ['|0', '|1'], [0, 1], features,
                          cls.GLUE_RULE2_GLOBAL_ID)
        glue_rule2.score = config.weights[config.order['glue-rule-count']]
        table._rules.append(glue_rule2)
        idx += 1

        if config.enable_type3_glue_rule:
            # S -> <S X; X S>
            features = [0] * feature_num
            features[glue_rule_index] = 1
            glue_rule3 = Rule('|0 |1', ['|1', '|0'], [1, 0], features,
                              cls.GLUE_RULE3_GLOBAL_ID)
            glue_rule3.score = config.weights[config.order['glue-rule-count']]
            table._rules.append(glue_rule3)
            idx += 1

        keys.append(cls.GLUE_RULE2.decode('utf-8'))
        ranges.append((1, idx - 1))

        table.glue_rule_ids = tuple(i for i in range(idx))

        # normal rules
        with Reader(filename) as reader:
            last_src = None
            current_rules = []
            for rule_str in reader:
                parts = rule_str.strip().split(' ||| ')
                src = parts[0]
                tgt = parts[1].split(' ')
                nonterminal_pos = []
                for pos, tword in enumerate(tgt):
                    if tword[0] == '|':
                        if len(nonterminal_pos) == 0:
                            nonterminal_pos.append(pos)
                        else:
                            index = int(tword[1:])
                            nonterminal_pos.insert(index, pos)
                features = [float(f) for f in parts[2].split(' ')]
                features.append(len(tgt) - len(nonterminal_pos))  # word number
                features.append(1)  # rule count
                features.append(0)  # glue rule count
                if len(parts) >= 4:
                    global_rule_id = int(parts[3])
                    rule = Rule(src, tgt, nonterminal_pos, features,
                                global_rule_id)
                else:
                    rule = Rule(src, tgt, nonterminal_pos, features, idx)
                lmscore, hlmscore = cls.__get_lm_scores(rule, lm)
                # 'features' is presumably the same list object held by the
                # rule, so the appended lm score becomes part of its features
                features.append(lmscore)  # lm score
                rule.hlmscore = hlmscore

                if last_src is None or src == last_src:
                    current_rules.append(rule)
                    last_src = src
                else:
                    cls.__update_table(table, keys, ranges, last_src,
                                       current_rules, config, max_rule_num)
                    current_rules = [rule]
                    last_src = src
                idx += 1

            cls.__update_table(table, keys, ranges, last_src, current_rules,
                               config, max_rule_num)

            table._idranges = RecordTrie('<II', zip(keys, ranges))
            del keys
            del ranges
            gc.collect()
        return table
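Each input line is ' ||| '-separated: source, target, a list of feature scores (presumably lex-sgt, lex-tgs, trans-sgt and trans-tgs, given the feature order in Config), and an optional global rule id. Nonterminals are written |0, |1, ...; the loader appends the word-count, rule-count, glue-rule-count and LM features itself. An illustrative line with made-up scores:

the |0 ||| le |0 ||| -0.5 -0.3 -0.4 -0.2 ||| 1234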
Example #6
    word_vectors = WordVectors.load_vectors(word_vector_file)
    embsize = word_vectors.embsize()

    print >> stderr, 'load RAE parameters...'
    theta = unpickle(theta_file)
    rae = RecursiveAutoencoder.build(theta, embsize)

    total_cost = 0
    total_instance_num = 0
    total_internal_node_num = 0

    print '=' * 63
    print '%20s %20s %20s' % ('all', 'avg/node', 'internal node')
    print '-' * 63

    with Reader(phrases_file) as reader, Writer(output_file) as writer:
        for phrase in reader:
            instance = Instance.parse_from_str(phrase, word_vectors)
            words_embedded = word_vectors[instance.words]
            root_node, cost = rae.forward(words_embedded)
            vec = root_node.p.T[0]  # flatten the n*1 column vector
            writer.write(' '.join(str(v) for v in vec))
            writer.write('\n')

            internal_node_num = len(instance.words) - 1
            if internal_node_num > 0:
                print '%20.8f %20.8f %20d' % (cost, cost / internal_node_num,
                                              internal_node_num)
            else:
                print '%20.8f %20.8f %20d' % (cost, cost, 0)
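Each output line is one phrase embedding written as space-separated floats, so the file can be read back with numpy (the path below is hypothetical):

import numpy as np
embeddings = np.loadtxt('phrase_embeddings.txt')  # shape: (num_phrases, embsize)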
Example #7
        logging.config.dictConfig(config_)

    k = options.kbest
    drop_oov = options.drop_oov
    debug = options.debug
    output_features = options.features
    checking = options.checking
    expend_loser = options.expend_loser
    with_rule_tree = options.with_rule_tree
    threads = options.threads
    logger.info('process num: %d' % threads)

    if options.input == '-':
        source = sys.stdin  # TODO encoding
    else:
        source = Reader(options.input)

    if options.output == '-':
        writer = sys.stdout
    else:
        writer = Writer(options.output)

    if debug:
        rules.DEBUG = 1

    config = Config(options.config)
    if logger.level <= logging.INFO:
        config.write(sys.stderr)

    lm = LanguageModel(config.lm_file, config.lm_order)
    rule_table = RuleTable.load(config.rule_table_file, lm, config)