Пример #1
0
def batch_loader(finput, fref, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize):
	"""Yield batches of (input, reference, target) samples read in lockstep.

	Samples come from ``zip(list_reader(finput), list_reader(fref),
	line_reader(ftarget))``; every target is converted with ``float``.
	The readers and ``get_bsize`` are defined outside this function, so
	their exact semantics are not visible here.

	The sample that opens a batch fixes its limits: the length cap is the
	combined input + reference length plus
	``min(maxpad, length // maxpart + 1)``, and the capacity is
	``get_bsize(cap, maxtoken, bsize)``. A sample joins the current batch
	while the batch holds fewer than ``minbsize`` samples, or while its
	combined length is within the cap and the batch is below capacity.
	Otherwise the current batch is yielded and the sample opens a new one.

	Yields:
		Tuples ``(inputs, references, targets, longest_input,
		longest_reference)`` where the last two are the maximum ``len`` of
		the inputs and references held in the batch.
	"""

	def _limits(total):
		# Length cap for a batch opened by a sample of combined length
		# `total`, together with the capacity get_bsize grants for it.
		limit = total + min(maxpad, total // maxpart + 1)
		return limit, get_bsize(limit, maxtoken, bsize)

	src_batch, ref_batch, tgt_batch = [], [], []
	count = cap = longest_src = longest_ref = 0
	capacity = bsize
	samples = zip(list_reader(finput), list_reader(fref), line_reader(ftarget))
	for src, ref, tgt in samples:
		n_src = len(src)
		n_ref = len(ref)
		total = n_src + n_ref
		# A cap of 0 means no limits are in force yet (always the case for
		# the very first sample).
		if cap == 0:
			cap, capacity = _limits(total)
		if count >= minbsize and (total > cap or count >= capacity):
			# The sample does not fit: emit what we have, then start a
			# fresh batch (new list objects) seeded with this sample.
			yield src_batch, ref_batch, tgt_batch, longest_src, longest_ref
			src_batch = [src]
			ref_batch = [ref]
			tgt_batch = [float(tgt)]
			longest_src, longest_ref = n_src, n_ref
			cap, capacity = _limits(total)
			count = 1
		else:
			src_batch.append(src)
			ref_batch.append(ref)
			tgt_batch.append(float(tgt))
			longest_src = max(longest_src, n_src)
			longest_ref = max(longest_ref, n_ref)
			count += 1
	# Emit the trailing, possibly partial, batch.
	if src_batch:
		yield src_batch, ref_batch, tgt_batch, longest_src, longest_ref
Пример #2
0
def batch_loader(finput, bsize, maxpad, maxpart, maxtoken, minbsize):
    """Yield batches of samples whose lengths fall in a common window.

    Samples come from ``list_reader(finput)``; that reader and
    ``get_bsize`` are defined outside this function.

    The sample that opens a batch fixes a length window around its own
    length ``n``: with ``slack = max(1, min(maxpad, n // maxpart + 1) // 2)``
    the window is ``[n - slack, n + slack]`` and the capacity is
    ``get_bsize(n + slack, maxtoken, bsize)``. A sample joins the current
    batch while the batch holds fewer than ``minbsize`` samples, or while
    its length lies inside the window and the batch is below capacity.
    Otherwise the current batch is yielded and the sample opens a new one.

    Yields:
        Tuples ``(samples, longest)`` where ``longest`` is the maximum
        ``len`` of the samples in the batch.
    """

    def _window(length):
        # Upper bound, lower bound and capacity for a batch opened by a
        # sample of the given length.
        slack = max(1, min(maxpad, length // maxpart + 1) // 2)
        high = length + slack
        low = length - slack
        return high, low, get_bsize(high, maxtoken, bsize)

    batch = []
    count = upper = lower = longest = 0
    capacity = bsize
    for sample in list_reader(finput):
        length = len(sample)
        # An upper bound of 0 means no window is in force yet (always the
        # case for the very first sample).
        if upper == 0:
            upper, lower, capacity = _window(length)
        in_window = lower <= length <= upper
        if count >= minbsize and not (in_window and count < capacity):
            # The sample does not fit: emit the batch, then start a fresh
            # list seeded with this sample.
            yield batch, longest
            batch = [sample]
            longest = length
            upper, lower, capacity = _window(length)
            count = 1
        else:
            batch.append(sample)
            longest = max(longest, length)
            count += 1
    # Emit the trailing, possibly partial, batch.
    if batch:
        yield batch, longest
Пример #3
0
def batch_loader(finput, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize):
	"""Yield batches of (input, target) samples read in lockstep.

	Samples come from ``zip(list_reader(finput), list_reader(ftarget))``;
	the reader and ``get_bsize`` are defined outside this function.

	The sample that opens a batch fixes its limits: the length cap is the
	combined input + target length plus
	``min(maxpad, length // maxpart + 1)``, and the capacity is
	``get_bsize(cap, maxtoken, bsize)``. A sample joins the current batch
	while the batch holds fewer than ``minbsize`` samples, or while its
	combined length is within the cap and the batch is below capacity.
	Otherwise the current batch is yielded and the sample opens a new one.

	Yields:
		Tuples ``(inputs, targets, longest_input, longest_target)`` where
		the last two are the maximum ``len`` of the inputs and targets
		held in the batch.
	"""

	def _limits(total):
		# Length cap for a batch opened by a sample of combined length
		# `total`, together with the capacity get_bsize grants for it.
		limit = total + min(maxpad, total // maxpart + 1)
		return limit, get_bsize(limit, maxtoken, bsize)

	src_batch, tgt_batch = [], []
	count = cap = longest_src = longest_tgt = 0
	for src, tgt in zip(list_reader(finput), list_reader(ftarget)):
		n_src = len(src)
		n_tgt = len(tgt)
		total = n_src + n_tgt
		# A cap of 0 means no limits are in force yet (always the case for
		# the very first sample, which is what first binds `capacity`).
		if cap == 0:
			cap, capacity = _limits(total)
		if count >= minbsize and (total > cap or count >= capacity):
			# The sample does not fit: emit what we have, then start a
			# fresh batch (new list objects) seeded with this sample.
			yield src_batch, tgt_batch, longest_src, longest_tgt
			src_batch = [src]
			tgt_batch = [tgt]
			longest_src, longest_tgt = n_src, n_tgt
			cap, capacity = _limits(total)
			count = 1
		else:
			src_batch.append(src)
			tgt_batch.append(tgt)
			longest_src = max(longest_src, n_src)
			longest_tgt = max(longest_tgt, n_tgt)
			count += 1
	# Emit the trailing, possibly partial, batch.
	if src_batch:
		yield src_batch, tgt_batch, longest_src, longest_tgt
Пример #4
0
def batch_loader(finput, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize):
    """Yield batches of (input, target) documents with equal sentence counts.

    Pairs come from ``zip(doc_reader(finput), doc_reader(ftarget))``; each
    reader item is unpacked as ``(document, length)``. ``doc_reader`` and
    ``get_bsize`` are defined outside this function, so what ``length``
    measures is not visible here (presumably a token length; confirm
    against ``doc_reader``).

    The pair that opens a batch fixes its limits. With ``n`` the sum of the
    two lengths and ``slack = max(1, min(maxpad, n // maxpart + 1) // 2)``,
    the length window is ``[n - slack, n + slack]`` and the capacity is
    ``max(1, get_bsize(n + slack, maxtoken, bsize) // len(input_doc))``.
    A pair joins the current batch only if ``len(input_doc)`` equals the
    batch's sentence count and, in addition, either the batch holds fewer
    than ``minbsize`` pairs or the pair's length is inside the window and
    the batch is below capacity. Otherwise the current batch is yielded and
    the pair opens a new one.

    Yields:
        Tuples ``(inputs, targets, longest_input, longest_target, nsent)``
        where the longest values are the maximum reader-reported lengths
        in the batch and ``nsent`` is the shared ``len`` of the input
        documents.
    """

    def _window(length, n_sent):
        # Upper bound, lower bound and per-document capacity for a batch
        # opened by a pair of the given combined length and sentence count.
        slack = max(1, min(maxpad, length // maxpart + 1) // 2)
        high = length + slack
        low = length - slack
        return high, low, max(1, get_bsize(high, maxtoken, bsize) // n_sent)

    src_docs, tgt_docs = [], []
    count = upper = lower = longest_src = longest_tgt = batch_nsent = 0
    capacity = bsize
    doc_pairs = zip(doc_reader(finput), doc_reader(ftarget))
    for (src_doc, src_len), (tgt_doc, tgt_len) in doc_pairs:
        doc_nsent = len(src_doc)
        total = src_len + tgt_len
        # An upper bound of 0 means no window is in force yet (always the
        # case for the very first pair).
        if upper == 0:
            upper, lower, capacity = _window(total, doc_nsent)
        # Likewise a sentence count of 0 means none has been adopted yet.
        if batch_nsent == 0:
            batch_nsent = doc_nsent
        has_room = count < minbsize or (
            lower <= total <= upper and count < capacity)
        if doc_nsent == batch_nsent and has_room:
            src_docs.append(src_doc)
            tgt_docs.append(tgt_doc)
            longest_src = max(longest_src, src_len)
            longest_tgt = max(longest_tgt, tgt_len)
            count += 1
        else:
            # The pair does not fit: emit the batch, then start fresh
            # lists seeded with this pair and adopt its sentence count.
            yield src_docs, tgt_docs, longest_src, longest_tgt, batch_nsent
            src_docs = [src_doc]
            tgt_docs = [tgt_doc]
            longest_src, longest_tgt = src_len, tgt_len
            batch_nsent = doc_nsent
            upper, lower, capacity = _window(total, doc_nsent)
            count = 1
    # Emit the trailing, possibly partial, batch.
    if src_docs:
        yield src_docs, tgt_docs, longest_src, longest_tgt, batch_nsent
Пример #5
0
def batch_loader(finput, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize):
    """Yield single-task batches of (input, target) samples.

    Samples come from ``zip(list_reader(finput), list_reader(ftarget))``.
    The first element of every input is taken as its task tag and the
    remaining elements as the actual input. ``list_reader``, ``get_bsize``
    and ``ceil`` are defined outside this function.

    The sample that opens a batch fixes the batch's task and limits: the
    length cap is the combined length ``n`` (input without its tag, plus
    target) plus ``min(maxpad, ceil(n / float(maxpart)))``, and the
    capacity is ``get_bsize(cap, maxtoken, bsize)``. A sample joins the
    current batch only if its task equals the batch's task and, in
    addition, either the batch holds fewer than ``minbsize`` samples or
    the sample's combined length is within the cap and the batch is below
    capacity. Otherwise the current batch is yielded and the sample opens
    a new one.

    Yields:
        Tuples ``(inputs, targets, task, longest_input, longest_target)``;
        the inputs have their task tag removed and the longest values are
        the maximum lengths in the batch (tag excluded for inputs).
    """
    part = float(maxpart)

    def _limits(total):
        # Length cap for a batch opened by a sample of combined length
        # `total`, together with the capacity get_bsize grants for it.
        limit = total + min(maxpad, ceil(total / part))
        return limit, get_bsize(limit, maxtoken, bsize)

    src_batch, tgt_batch = [], []
    batch_task = None
    count = cap = longest_src = longest_tgt = 0
    for src, tgt in zip(list_reader(finput), list_reader(ftarget)):
        n_src = len(src) - 1  # the leading task tag is not counted
        n_tgt = len(tgt)
        total = n_src + n_tgt
        task = src[0]
        # To filter out empty data (e.g. in OPUS-100), skip the sample here:
        #     if (n_src <= 0) or (n_tgt <= 0):
        #         continue
        # A cap of 0 means no limits are in force yet (always the case for
        # the very first sample, which is what first binds `capacity`).
        if cap == 0:
            cap, capacity = _limits(total)
            batch_task = task
        same_task = batch_task == task
        has_room = count < minbsize or (total <= cap and count < capacity)
        if same_task and has_room:
            src_batch.append(src[1:])
            tgt_batch.append(tgt)
            longest_src = max(longest_src, n_src)
            longest_tgt = max(longest_tgt, n_tgt)
            count += 1
        else:
            # The sample does not fit: emit the batch, then start fresh
            # lists seeded with this sample and adopt its task.
            yield src_batch, tgt_batch, batch_task, longest_src, longest_tgt
            src_batch = [src[1:]]
            batch_task = task
            tgt_batch = [tgt]
            longest_src, longest_tgt = n_src, n_tgt
            cap, capacity = _limits(total)
            count = 1
    # Emit the trailing, possibly partial, batch.
    if src_batch:
        yield src_batch, tgt_batch, batch_task, longest_src, longest_tgt
Пример #6
0
def batch_loader(finput, fmt, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize):
	"""Yield batches of (input, mt, target) samples read in lockstep.

	Samples come from ``zip(list_reader(finput), list_reader(fmt),
	list_reader(ftarget))``. ``list_reader``, ``get_bsize`` and ``ceil``
	are defined outside this function.

	The sample that opens a batch fixes its limits: the length cap is the
	combined length ``n`` of the three sequences plus
	``min(maxpad, ceil(n / float(maxpart)))``, and the capacity is
	``get_bsize(cap, maxtoken, bsize)``. A sample joins the current batch
	while the batch holds fewer than ``minbsize`` samples, or while its
	combined length is within the cap and the batch is below capacity.
	Otherwise the current batch is yielded and the sample opens a new one.

	Yields:
		Tuples ``(inputs, mts, targets, longest_input, longest_mt,
		longest_target)`` where the longest values are the maximum ``len``
		of the respective sequences held in the batch.
	"""
	part = float(maxpart)

	def _limits(total):
		# Length cap for a batch opened by a sample of combined length
		# `total`, together with the capacity get_bsize grants for it.
		limit = total + min(maxpad, ceil(total / part))
		return limit, get_bsize(limit, maxtoken, bsize)

	src_batch, mt_batch, tgt_batch = [], [], []
	count = cap = longest_src = longest_mt = longest_tgt = 0
	samples = zip(list_reader(finput), list_reader(fmt), list_reader(ftarget))
	for src, mt, tgt in samples:
		n_src = len(src)
		n_mt = len(mt)
		n_tgt = len(tgt)
		total = n_src + n_mt + n_tgt
		# A cap of 0 means no limits are in force yet (always the case for
		# the very first sample, which is what first binds `capacity`).
		if cap == 0:
			cap, capacity = _limits(total)
		if count >= minbsize and (total > cap or count >= capacity):
			# The sample does not fit: emit what we have, then start a
			# fresh batch (new list objects) seeded with this sample.
			yield src_batch, mt_batch, tgt_batch, longest_src, longest_mt, longest_tgt
			src_batch = [src]
			mt_batch = [mt]
			tgt_batch = [tgt]
			longest_src, longest_mt, longest_tgt = n_src, n_mt, n_tgt
			cap, capacity = _limits(total)
			count = 1
		else:
			src_batch.append(src)
			mt_batch.append(mt)
			tgt_batch.append(tgt)
			longest_src = max(longest_src, n_src)
			longest_mt = max(longest_mt, n_mt)
			longest_tgt = max(longest_tgt, n_tgt)
			count += 1
	# Emit the trailing, possibly partial, batch.
	if src_batch:
		yield src_batch, mt_batch, tgt_batch, longest_src, longest_mt, longest_tgt
Пример #7
0
def batch_loader(finput, bsize, maxpad, maxpart, maxtoken, minbsize):
    """Yield single-task batches of samples with lengths in a common window.

    Samples come from ``list_reader(finput)``. The first element of every
    sample is taken as its task tag and the remaining elements as the
    actual input. ``list_reader``, ``get_bsize`` and ``ceil`` are defined
    outside this function.

    The sample that opens a batch fixes the batch's task and limits. With
    ``n`` the sample length without its tag and
    ``slack = max(1, min(maxpad, ceil(n / float(maxpart))) // 2)``, the
    length window is ``[n - slack, n + slack]`` and the capacity is
    ``get_bsize(n + slack, maxtoken, bsize)``. A sample joins the current
    batch only if its task equals the batch's task and, in addition,
    either the batch holds fewer than ``minbsize`` samples or the sample's
    length is inside the window and the batch is below capacity. Otherwise
    the current batch is yielded and the sample opens a new one.

    Yields:
        Tuples ``(samples, task, longest)``; the samples have their task
        tag removed and ``longest`` is the maximum tag-free length in the
        batch.
    """
    part = float(maxpart)

    def _window(length):
        # Upper bound, lower bound and capacity for a batch opened by a
        # sample of the given (tag-free) length.
        slack = max(1, min(maxpad, ceil(length / part)) // 2)
        high = length + slack
        low = length - slack
        return high, low, get_bsize(high, maxtoken, bsize)

    batch = []
    batch_task = None
    count = upper = lower = longest = 0
    for sample in list_reader(finput):
        length = len(sample) - 1  # the leading task tag is not counted
        task = sample[0]
        # To filter out empty data, skip the sample here:
        #     if length <= 0:
        #         continue
        # An upper bound of 0 means no window is in force yet (always the
        # case for the very first sample, which first binds `capacity`).
        if upper == 0:
            upper, lower, capacity = _window(length)
            batch_task = task
        same_task = batch_task == task
        has_room = count < minbsize or (
            lower <= length <= upper and count < capacity)
        if same_task and has_room:
            batch.append(sample[1:])
            longest = max(longest, length)
            count += 1
        else:
            # The sample does not fit: emit the batch, then start a fresh
            # list seeded with this sample and adopt its task.
            yield batch, batch_task, longest
            batch = [sample[1:]]
            batch_task = task
            longest = length
            upper, lower, capacity = _window(length)
            count = 1
    # Emit the trailing, possibly partial, batch.
    if batch:
        yield batch, batch_task, longest