Example No. 1
    def translate_maxibatch(maxibatch,
                            num_to_target,
                            num_prev_translated,
                            mask=0):
        """Translates an individual maxibatch.

        Args:
            maxibatch: a list of sentences.
            num_to_target: dictionary mapping target vocabulary IDs to strings.
            num_prev_translated: the number of previously translated sentences.
            mask: number of trailing tokens to drop from each best hypothesis
                before the end-of-sentence token is re-appended (only used
                when nbest is False).
        """

        # Sort the maxibatch by length and split into minibatches.
        try:
            minibatches, idxs = util.read_all_lines(config, maxibatch,
                                                    minibatch_size)
        except exception.Error as x:
            logging.error(x.msg)
            sys.exit(1)

        # Translate the minibatches and store the resulting beam (i.e.
        # translations and scores) for each sentence.
        beams = []
        for x in minibatches:
            y_dummy = numpy.zeros(shape=(len(x), 1))
            x, x_mask, _, _ = util.prepare_data(x,
                                                y_dummy,
                                                config.factors,
                                                maxlen=None)
            sample = translate_batch(session, sampler, x, x_mask,
                                     max_translation_len, normalization_alpha)
            beams.extend(sample)
            num_translated = num_prev_translated + len(beams)
            logging.info('Translated {} sents'.format(num_translated))

        # Put beams into the same order as the input maxibatch.
        tmp = numpy.array(beams, dtype=numpy.object)
        ordered_beams = tmp[idxs.argsort()]

        # Write the translations to the output file.
        for i, beam in enumerate(ordered_beams):
            if nbest:
                num = num_prev_translated + i
                for sent, cost in beam:
                    translation = util.seq2words(sent, num_to_target)
                    line = "{} ||| {} ||| {}\n".format(num, translation,
                                                       str(cost))
                    output_file.write(line)
            else:
                best_hypo, cost = beam[0]
                # Truncate the hypothesis at the first end-of-sentence token
                # (vocabulary id 0), if one is present.
                eos_idx = list(best_hypo).index(0) if 0 in best_hypo else len(
                    best_hypo)
                best_hypo = best_hypo[:eos_idx]
                # Drop the last `mask` tokens (or everything, if the hypothesis
                # is not longer than `mask`), then re-append the EOS token.
                best_hypo = best_hypo[:len(best_hypo) -
                                      mask] if len(best_hypo) > mask else []
                best_hypo = list(best_hypo) + [0]
                line = util.seq2words(best_hypo, num_to_target) + '\n'
                output_file.write(line)
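For reference, the "put beams back in input order" step above relies on the argsort of the index array returned by util.read_all_lines; a minimal, self-contained sketch of that reordering trick with toy data (not code from the repository):

import numpy

sentences = ["a b c", "d", "e f"]
idxs = numpy.array([1, 2, 0])                 # original position of each length-sorted item
sorted_by_len = [sentences[i] for i in idxs]  # ['d', 'e f', 'a b c']
beams = [s.upper() for s in sorted_by_len]    # stand-in for the translations
restored = numpy.array(beams, dtype=object)[idxs.argsort()]
print(list(restored))                         # ['A B C', 'D', 'E F']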
Example No. 2
    def _send_jobs(self, input_, translation_settings):
        """
        """
        source_batches = []

        try:
            batches, idxs = util.read_all_lines(self._options[0], input_,
                                                self._batch_size)
        except exception.Error as x:
            logging.error(x.msg)
            for process in self._processes:
                process.terminate()
            sys.exit(1)

        for idx, batch in enumerate(batches):

            input_item = QueueItem(
                verbose=self._verbose,
                k=translation_settings.beam_size,
                normalization_alpha=translation_settings.normalization_alpha,
                nbest=translation_settings.n_best,
                batch=batch,
                idx=idx,
                request_id=translation_settings.request_id)

            self._input_queue.put(input_item)
            source_batches.append(batch)
        return idx + 1, source_batches, idxs
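_send_jobs only fills the queue; the worker processes that consume it are created elsewhere and are not shown here. A hedged sketch of what a generic consumer loop for such a queue could look like (the names and the shutdown convention are assumptions, not this code base's actual worker):

def worker_loop(input_queue, output_queue, translate_fn):
    # Repeatedly take a QueueItem, translate its batch, and emit the result
    # together with the batch index so the caller can restore input order.
    while True:
        item = input_queue.get()
        if item is None:          # assumed shutdown sentinel
            break
        output_queue.put((item.idx, translate_fn(item.batch)))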
Example No. 3
def EventCoreference_t():
    '''
    Run the SinoCoreferencer pipeline over every file listed in topic62's
    directory.all and post-process the results.
    '''
    PATH = u"/home/lzh/Documents/SinoCoreferencer/突发事件/社会安全/topic62"
    testLoc = "/home/lzh/Documents/SinoCoreferencer/test"

    childfiles = []
    for dirpath, dirpathnames, filenames in os.walk(PATH):
        directory = os.path.join(dirpath, 'directory.all')
        dir_lines = read_all_lines(directory)
        for line in dir_lines:
            f_test = open(testLoc, 'w')
            absoulePath = os.path.join(dirpath, line).encode('utf-8')
            print absoulePath
            f_test.write(absoulePath + '\n')
            f_test.flush()

            input_file = line + '.shtml.out'
            each_input = 'topic62_'
            input_dir = os.path.join('/home/lzh/Downloads/data/', each_input,
                                     input_file)
            FileGeneration(dirpath, input_dir, line)

            os.chdir("/home/lzh/Documents/SinoCoreferencer/")
            os.system("./run-coreference.sh test")
            Post_arg(dirpath, input_dir, line)
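Several of the standalone scripts on this page call a module-level read_all_lines(path) directly. Its implementation is not shown here; a minimal sketch of what such a helper plausibly does (an assumption, not the repository's actual code):

def read_all_lines(path):
    # Read a text file and return its non-empty lines without trailing newlines.
    with open(path) as f:
        return [line.rstrip('\n') for line in f if line.strip()]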
Example No. 4
def InputProcess(inputFile):
    '''
    Parse the '|'-separated records of a '.out' file, merging lines that
    share the same value in field 2 (the position field).
    '''

    input_lines = read_all_lines(inputFile)

    attr_set = []
    attr = []
    for line in input_lines:
        pos_set = []
        for i in range(len(attr_set)):
            pos_set.append(attr_set[i][2])
        attr = line.strip().split('|')
        word = attr[0].split(',')
        attr[0] = word[0]

        if attr[2] in pos_set:
            for i in range(len(attr_set)):
                if attr[2] == attr_set[i][2]:
                    attr_set[i].append(word[1])
#                     attr_set[i].append(word[2])
        else:
            attr.append(word[1])
            #             attr.append(word[2])
            attr_set.append(attr)


#     for a in attr_set:
#         print a
    return attr_set
Example No. 5
    def translate_maxibatch(maxibatch, model_set, num_to_target,
                            num_prev_translated):
        """Translates an individual maxibatch.

        Args:
            maxibatch: a list of sentences.
            model_set: an InferenceModelSet object.
            num_to_target: dictionary mapping target vocabulary IDs to strings.
            num_prev_translated: the number of previously translated sentences.
        """

        # Sort the maxibatch by length and split into minibatches.
        try:
            pre_minibatches, minibatches, idxs = util.read_all_lines(
                configs[0], maxibatch, minibatch_size)
        except exception.Error as x:
            logging.error(x.msg)
            sys.exit(1)

        # Translate the minibatches and store the resulting beam (i.e.
        # translations and scores) for each sentence.
        beams = []
        for px, x in zip(pre_minibatches, minibatches):
            y_dummy = numpy.zeros(shape=(len(x), 1))
            px, x, x_mask, _, _ = util.prepare_data(x,
                                                    y_dummy,
                                                    configs[0].factors,
                                                    px,
                                                    maxlen=None)
            sample = model_set.decode(session=session,
                                      px=px,
                                      x=x,
                                      x_mask=x_mask,
                                      beam_size=beam_size,
                                      normalization_alpha=normalization_alpha)
            beams.extend(sample)
            num_translated = num_prev_translated + len(beams)
            logging.info('Translated {} sents'.format(num_translated))

        # Put beams into the same order as the input maxibatch.
        tmp = numpy.array(beams, dtype=numpy.object)
        ordered_beams = tmp[idxs.argsort()]

        # Write the translations to the output file.
        for i, beam in enumerate(ordered_beams):
            if nbest:
                num = num_prev_translated + i
                for sent, cost in beam:
                    translation = util.seq2words(sent, num_to_target)
                    line = "{} ||| {} ||| {}\n".format(num, translation,
                                                       str(cost))
                    output_file.write(line)
            else:
                best_hypo, cost = beam[0]
                line = util.seq2words(best_hypo, num_to_target) + '\n'
                output_file.write(line)
Example No. 6
def init_test_rel_dict():
    """
    Map test-corpus sentence ids to their relation types.
    """
    all_lines = read_all_lines('corpus/TEST_FILE_KEY.TXT')
    num2rel_dict = nltk.defaultdict(str)
    for line in all_lines:
        num, rel = line.split('\t')[:]
        num2rel_dict[num] = rel
    return num2rel_dict
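The key file read above is expected to hold one tab-separated id/relation pair per line, in the style of the SemEval-2010 Task 8 answer key; a small usage sketch with illustrative contents:

# Illustrative TEST_FILE_KEY.TXT lines (tab-separated id and relation label):
#   8001    Message-Topic(e1,e2)
#   8002    Other
num2rel_dict = init_test_rel_dict()
print(num2rel_dict['8001'])   # -> 'Message-Topic(e1,e2)'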
Example No. 7
    def translate_maxibatch(maxibatch, model_set, num_to_target,
                            num_prev_translated):
        """Translates an individual maxibatch.

        Args:
            maxibatch: a list of sentences.
            model_set: an InferenceModelSet object.
            num_to_target: dictionary mapping target vocabulary IDs to strings.
            num_prev_translated: the number of previously translated sentences.
        """

        # Sort the maxibatch by length and split into minibatches.
        try:
            minibatches, idxs = util.read_all_lines(configs[0], maxibatch,
                                                    minibatch_size)
        except exception.Error as x:
            logging.error(x.msg)
            sys.exit(1)

        # Translate the minibatches and store the resulting beam (i.e.
        # translations and scores) for each sentence.
        beams = []
        for x in minibatches:
            y_dummy = numpy.zeros(shape=(len(x), 1))
            x, x_mask, _, _ = util.prepare_data(x, y_dummy, configs[0].factors,
                                                maxlen=None)
            sample = model_set.beam_search(
                session=session,
                x=x,
                x_mask=x_mask,
                beam_size=beam_size,
                normalization_alpha=normalization_alpha)
            beams.extend(sample)
            num_translated = num_prev_translated + len(beams)
            logging.info('Translated {} sents'.format(num_translated))

        # Put beams into the same order as the input maxibatch.
        tmp = numpy.array(beams, dtype=numpy.object)
        ordered_beams = tmp[idxs.argsort()]

        # Write the translations to the output file.
        for i, beam in enumerate(ordered_beams):
            if nbest:
                num = num_prev_translated + i
                for sent, cost in beam:
                    translation = util.seq2words(sent, num_to_target)
                    line = "{} ||| {} ||| {}\n".format(num, translation,
                                                       str(cost))
                    output_file.write(line)
            else:
                best_hypo, cost = beam[0]
                line = util.seq2words(best_hypo, num_to_target) + '\n'
                output_file.write(line)
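When nbest is set, each output line follows the " ||| "-separated format built above (sentence number, translation, score); a small sketch for reading such a line back:

def parse_nbest_line(line):
    # Split a "num ||| translation ||| score" line into its three fields.
    num, translation, score = [field.strip() for field in line.split('|||')]
    return int(num), translation, float(score)

print(parse_nbest_line("0 ||| das ist ein Test ||| -1.234"))
# -> (0, 'das ist ein Test', -1.234)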
Example No. 8
def Post_arg(destination, inputFile, file):
    '''
    Post-process: regenerate the .arg file with tab-separated place and time
    arguments, normalizing time expressions against the article's news time.
    '''
    arg = os.path.join(destination, "%s.arg" % file)

    #     all attributes extracted from '.out' file
    attr_set = InputProcess(inputFile)

    #     get news time
    destination_file = os.path.join(destination, file)
    news_time = ''
    child_body = read_all_lines(destination_file)
    for child_line in child_body:
        if child_line.strip().startswith("time:"):
            news_time = child_line[5:].strip()


#     post generate .arg file
    f_arg = open(arg, 'w')
    for attr in attr_set:
        try:
            f_arg.write("==================\n")
            length = len(attr[0])
            line = attr[2] + ',' + str(int(attr[2]) + length - 1
                                       ) + ' ' + attr[1] + ' ' + attr[0] + '\n'
            f_arg.write(line)

            #         add place feature
            length_place = len(attr[9])
            if attr[9] != 'NULL':
                line = attr[10] + ',' + str(
                    int(attr[10]) + length_place -
                    1) + '\t' + 'place' + '\t' + attr[9] + '\n'
                f_arg.write(line)

    #         Add time Feature
            length_time = len(attr[7])
            if attr[7] != 'NULL':

                new_time = TimeRegularzation(news_time, attr[7])
                line = attr[8] + ',' + str(
                    int(attr[8]) + length_time -
                    1) + '\t' + 'time' + '\t' + new_time + '\n'
                #                     without changing time
                #                 line = attr[8] + ',' + str(int(attr[8])+length_time-1) + ' time ' +  attr[7]+ '\n'

                f_arg.write(line)
        except Exception, e:
            print Exception, ":", e
            continue
Example No. 9
def CombineAllCoreference_t():
    '''
    Run CombineCoreference over every file listed in topic62's directory.all.
    '''
    PATH = u"/home/lzh/Documents/SinoCoreferencer/突发事件/社会安全/topic62"

    childfiles = []
    for dirpath, dirpathnames, filenames in os.walk(PATH):
        directory = os.path.join(dirpath, 'directory.all')
        dir_lines = read_all_lines(directory)
        for line in dir_lines:
            print dirpath
            print line
            CombineCoreference(dirpath, line)
Example No. 10
def CombineCoreference(
    Destination,
    file,
):
    '''
    generate the coreference output file (.coreference3), which contains detailed information about coreferent events
    '''

    coref_file = file + '.coref.events'
    coref_dir = os.path.join(Destination, coref_file)
    if os.path.exists(coref_dir):

        coref_lines = read_all_lines(coref_dir)
        arg_file = file + '.arg'
        arg_dir = os.path.join(Destination, arg_file)
        arg_lines = read_all_lines(arg_dir)

        coref_attr_file = file + '.coreference3'
        coref_attr_dir = os.path.join(Destination, coref_attr_file)
        # Create (or truncate) the output file before writeCoreference_v2
        # writes the combined coreference details into it.
        f_coref = open(coref_attr_dir, 'w')
        f_coref.close()

        writeCoreference_v2(coref_lines, arg_lines, coref_attr_dir)
Example No. 11
def build_test_corpus(cut=False):
    """
    Build the test corpus.
    """
    num2rel_dict = init_test_rel_dict()
    file = open('corpus_handle/test.txt', 'w', encoding='utf-8')
    all_lines = read_all_lines('corpus/TEST_FILE.txt')
    for line in all_lines:
        items = line.split('\t')[:]
        num, sentence = items[0], items[1][1:-2]
        sentence = re.sub('\s+', ' ', sentence)
        sentence, index_1, index_2 = cut_sentence(sentence, cut=cut)  # truncate the sentence
        relation_type = num2rel_dict[num]
        file.write('%s|%d|%d|%s\n' %
                   (relation_type, index_1, index_2, sentence))
    file.close()
Example No. 12
def build_train_corpus(cut=False):
    """
    Build the training corpus.
    """
    file = open('corpus_handle/train.txt', 'w', encoding='utf-8')
    all_lines = read_all_lines('corpus/TRAIN_FILE.TXT')
    times = int(len(all_lines) / 3)
    for i in range(times):  # read three lines at a time
        line_0, line_1, line_2 = all_lines[i * 3:(i + 1) * 3]
        # the sentence
        sentence = line_0.split('\t')[1][1:-2]
        sentence = re.sub('\s+', ' ', sentence)
        sentence, index_1, index_2 = cut_sentence(sentence, cut=cut)
        # the relation type
        relation_type = line_1.split('(')[0]
        file.write('%s|%d|%d|%s\n' %
                   (relation_type, index_1, index_2, sentence))
    file.close()
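Both writers above emit relation|index_1|index_2|sentence lines; a matching reader sketch (the field names simply follow the write calls):

def read_corpus_line(line):
    # Parse a "relation|index_1|index_2|sentence" line written above; the
    # maxsplit of 3 keeps any '|' inside the sentence intact.
    relation, idx1, idx2, sentence = line.rstrip('\n').split('|', 3)
    return relation, int(idx1), int(idx2), sentence

print(read_corpus_line('Cause-Effect(e2,e1)|4|9|the burst was caused by the pressure\n'))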
Example No. 13
def dataAnalysis():
    '''
    count how many time expressions were normalized (processed) and how many were not (unprocessed)
    '''
    lines = read_all_lines("/home/lzh/Downloads/data/collection.time")
    new_lines = []
    process = 0
    unprocess = 0
    for line in lines:
        time_re = TimeRegularzation("2016-11-28 19:37:00", line)
        new_lines.append(time_re)
        if time_re == '2016-11-28 0':
            unprocess += 1
        else:
            process += 1
    f_time = open("/home/lzh/Downloads/data/collection1.time", 'w')
    for l in new_lines:
        f_time.write(l + '\n')
    f_time.write("processed: %d" % process + '\n')
    f_time.write("unprocessed: %d" % unprocess + '\n')
Example No. 14
def Multi():
    '''
    time attribute statistics
    '''
    f_time = open("/home/lzh/Downloads/data/collection.time", 'w')
    PATH = u"/home/lzh/Documents/SinoCoreferencer/突发事件/社会安全"

    for dirpath, dirpathnames, filenames in os.walk(PATH):
        for each in dirpathnames:
            childfile = os.path.join(dirpath, each)
            directory = os.path.join(childfile, 'directory.all')
            dir_lines = read_all_lines(directory)
            for line in dir_lines:

                input_file = line + '.shtml.out'
                each_input = each + '_'
                input_dir = os.path.join('/home/lzh/Downloads/data/',
                                         each_input, input_file)
                time_set = ExtractTiming(input_dir)
                for t in time_set:
                    f_time.write(t + '\n')
Example No. 15
def FileGeneration(Destination, inputFile, file):
    '''
    create the companion files (.arg, .coref.entities, .time, .trigger, .type, .value) that the pipeline expects
    '''
    arg = os.path.join(Destination, "%s.arg" % file)
    entities = os.path.join(Destination, '%s.coref.entities' % file)
    timeFile = os.path.join(Destination, '%s.time' % file)
    trigger = os.path.join(Destination, '%s.trigger' % file)
    typeFile = os.path.join(Destination, '%s.type' % file)
    valueFile = os.path.join(Destination, '%s.value' % file)
    xmlFile = os.path.join(Destination, '%s.xml' % file)

    #     all attributes extracted from '.out' file

    attr_set = InputProcess(inputFile)

    #     get news time
    destination_file = os.path.join(Destination, file)
    news_time = ''
    child_body = read_all_lines(destination_file)
    for child_line in child_body:
        if child_line.strip().startswith("time:"):
            news_time = child_line[5:].strip()
    '''
    add event attributes
    '''
    f_arg = open(arg, 'w')

    for attr in attr_set:
        try:
            f_arg.write("==================\n")
            length = len(attr[0])
            line = attr[2] + ',' + str(int(attr[2]) + length - 1
                                       ) + ' ' + attr[1] + ' ' + attr[0] + '\n'
            f_arg.write(line)

            #         add place feature
            length_place = len(attr[9])
            if attr[9] != 'NULL':
                line = attr[10] + ',' + str(int(attr[10]) + length_place -
                                            1) + ' place ' + attr[9] + '\n'
                f_arg.write(line)

    #         Add time Feature
            length_time = len(attr[7])
            if attr[7] != 'NULL':

                #                 change time to standard formulate
                #                 new_time = TimeRegularzation(news_time, attr[7])
                #                 line = attr[8] + ',' + str(int(attr[8])+length_time-1) + ' time ' + new_time + '\n'

                #                 without changing time
                line = attr[8] + ',' + str(int(attr[8]) + length_time -
                                           1) + ' time ' + attr[7] + '\n'

                f_arg.write(line)


#             add relation feature
#             for i in range(11,len(attr)):
#                 relation = attr[i].split(':')
#                 pos1 = attr[4].index(relation[1])
#                 pos2 = attr[4].index(attr[0])
#                 length_relation = len(relation[1])
#                 start_pos = int(attr[2])+pos1-pos2
#                 line = str(start_pos) + ',' + str(start_pos+length_relation-1) + ' ' + relation[0] + ' ' + relation[1] + '\n'
#                 f_arg.write(line)

            if not os.path.exists(entities):
                open(entities, 'w')

            f_time = open(timeFile, 'w')
            for attr in attr_set:
                length = len(attr[7])
                if attr[7] != 'NULL':

                    new_time = TimeRegularzation(news_time, attr[7])
                    line = attr[8] + ',' + str(
                        int(attr[8]) + length -
                        1) + ' time time ' + new_time + '\n'
                    #                     without changing time
                    #                     line = attr[8] + ',' + str(int(attr[8])+length-1) + ' time time ' + attr[7] + '\n'

                    f_time.write(line)

            f_trigger = open(trigger, 'w')
            for attr in attr_set:
                length = len(attr[0])
                line = attr[2] + ',' + str(
                    int(attr[2]) + length -
                    1) + ' ' + attr[1] + ' ' + attr[0] + '\n'
                f_trigger.write(line)

            if not os.path.exists(typeFile):
                open(typeFile, 'w')

            if not os.path.exists(valueFile):
                open(valueFile, 'w')
        except Exception, e:
            print Exception, ":", e
            continue
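The trigger lines written above all have the shape "start,end eventType trigger", with the end offset computed from the trigger length; a tiny re-statement of that layout (the offsets and labels are illustrative):

def format_trigger_line(start, event_type, trigger):
    # Mirrors the "start,end eventType trigger" layout written to the .arg
    # and .trigger files, where end = start + len(trigger) - 1.
    end = start + len(trigger) - 1
    return "%d,%d %s %s" % (start, end, event_type, trigger)

print(format_trigger_line(17, "Attack", u"爆炸"))   # -> "17,18 Attack 爆炸"

The optional place and time lines use the same "start,end role text" shape, separated by spaces in FileGeneration and by tabs in Post_arg.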