def semi_supervised(samples_path, write_path, beam_search):
    """Use references to predict new sources and build new training samples.

    Each line of the input file is expected to look like ``source<sep>ref``;
    the model predicts a new source from ``ref`` and the pair
    ``prediction <sep> ref`` is appended to the output file.

    Args:
        samples_path (str): Path of the file containing the reference samples.
        write_path (str): Path of the file the new samples are appended to.
        beam_search (bool): Whether to decode with beam search.
    """
    pred = Predict()
    print('vocab_size: ', len(pred.vocab))
    count = 0
    semi = []

    with open(samples_path, 'r') as f:
        for picked in f:
            count += 1
            # Only the reference side is used; the original source is discarded.
            source, ref = picked.strip().split('<sep>')
            prediction = pred.predict(ref.split(), beam_search=beam_search)
            semi.append(prediction + ' <sep> ' + ref)

            # Flush to disk in batches of 100 to bound memory usage.
            if count % 100 == 0:
                print(count)
                write_samples(semi, write_path, 'a')
                semi = []

    # BUG FIX: the original dropped the final partial batch (< 100 samples).
    if semi:
        write_samples(semi, write_path, 'a')
Exemplo n.º 2
0
def semi_supervised(samples_path, write_path, beam_search):
    """Use references to predict new sources and build new training samples.

    Each line of the input file is expected to look like ``source<sep>ref``;
    the model predicts a new source from ``ref`` and the pair
    ``prediction <sep> ref`` is appended to the output file.

    Args:
        samples_path (str): Path of the file containing the reference samples.
        write_path (str): Path of the file the new samples are appended to.
        beam_search (bool): Whether to decode with beam search.
    """
    pred = Predict()
    print('vocab_size:', len(pred.vocab))
    count = 0
    semi = []

    with open(samples_path, 'r') as f:
        for picked in f:
            count += 1
            # Only the reference side is used; the original source is discarded.
            source, ref = picked.strip().split('<sep>')
            prediction = pred.predict(ref.split(), beam_search=beam_search)
            # Join the predicted source with the reference to form a new sample.
            semi.append(prediction + ' <sep> ' + ref)

            # Flush to disk in batches of 100 to bound memory usage.
            if count % 100 == 0:
                print(count)
                write_samples(semi, write_path, 'a')
                semi = []

    # BUG FIX: the original dropped the final partial batch (< 100 samples).
    if semi:
        write_samples(semi, write_path, 'a')
Exemplo n.º 3
0
def translate_continue(sample_path, translate_path):
    """Back-translate a samples file, resuming where a previous run stopped.

    Args:
        sample_path (str): original file path
        translate_path (str): target file path
    Returns:
        (str): result of back translation
    """
    # If the target file exists, count its lines so we can resume after them;
    # otherwise create an empty file and start from the beginning.
    if os.path.exists(translate_path):
        with open(translate_path, 'r+', encoding='utf8') as file:
            exit_len = len(list(file))
    else:
        with open(translate_path, 'w', encoding='utf8') as file:
            exit_len = 0

    translated = []
    count = 0
    with open(sample_path, 'r', encoding='utf8') as file:
        for line in file:
            count += 1
            print(count)
            # Skip already-translated lines; line 21585 is skipped as a known
            # problematic sample -- TODO confirm why.
            if count <= exit_len or count == 21585:
                continue
            source, ref = tuple(line.strip().split('<sep>'))
            # Back-translate both sides; on failure back off briefly and skip
            # the sample (presumably rate limiting on the translation service).
            source = back_translate(source.strip())
            if not source:
                time.sleep(1.5)
                continue
            ref = back_translate(ref.strip())
            if not ref:
                time.sleep(1.5)
                continue
            # Re-segment the translated text with jieba.
            source = ' '.join(list(jieba.cut(source)))
            ref = ' '.join(list(jieba.cut(ref)))
            translated.append(source + ' <sep> ' + ref)
            # Flush to disk in batches of 10.
            if count % 10 == 0:
                print(count)
                write_samples(translated, translate_path, 'a')
                translated = []
            if count == 1010:
                break
    # BUG FIX: flush whatever accumulated since the last batch write; the
    # original silently dropped it (the commented-out write at the break
    # shows the missing flush).
    if translated:
        write_samples(translated, translate_path, 'a')
Exemplo n.º 4
0
    def generate_samples(self, write_path):
        """Generate a new samples file by replacing words in each reference.

        Args:
            write_path (str): Path of the file the new samples are appended to.
        """
        replaced = []
        count = 0
        for sample, token_list, doc in zip(self.samples, self.refs, self.corpus):
            # Keep the original source, pair it with a word-replaced reference.
            replaced.append(
                sample.split('<sep>')[0] + ' <sep> ' + self.replace(token_list, doc)
                )
            # BUG FIX: count/flush after appending so batches are exactly 100
            # samples (the original flushed before appending the current one).
            count += 1
            if count % 100 == 0:
                print(count)
                write_samples(replaced, write_path, 'a')
                replaced = []
        # BUG FIX: write the final partial batch, which the original dropped.
        if replaced:
            write_samples(replaced, write_path, 'a')
def translate_continue(sample_path, translate_path):
    """Back-translate a samples file, resuming where a previous run stopped.

    Args:
        sample_path (str): original file path (resolved relative to curPath)
        translate_path (str): target file path
    Returns:
        (str): result of back translation
    """
    # If the target file exists, count its lines so we can resume after them.
    # BUG FIX: encoding was misspelled 'urf-8', which raises LookupError.
    if os.path.exists(translate_path):
        with open(translate_path, 'r+', encoding='utf-8') as file:
            exit_len = len(list(file))
    else:
        exit_len = 0

    translated = []
    count = 0
    with open(curPath + sample_path, 'r', encoding='utf-8') as file:
        for line in file:
            count += 1
            print(count)
            # Skip already-translated lines; line 21585 is skipped as a known
            # problematic sample -- TODO confirm why.
            if count <= exit_len or count == 21585:
                continue
            source, ref = tuple(line.strip().split('<sep>'))
            source = back_translate(source.strip())
            # BUG FIX: guard against a failed back-translation; the original
            # passed the falsy result straight into jieba.cut and crashed.
            if not source:
                continue
            ref = back_translate(ref.strip())
            if not ref:
                continue
            # Re-segment the translated text with jieba.
            source = ' '.join(list(jieba.cut(source)))
            ref = ' '.join(list(jieba.cut(ref)))
            translated.append(source + ' <sep> ' + ref)
            # Flush to disk in batches of 10.
            if count % 10 == 0:
                print(count)
                write_samples(translated, translate_path, 'a')
                translated = []
                if count == 1000:
                    break
    # BUG FIX: flush whatever accumulated since the last batch write; the
    # original silently dropped it.
    if translated:
        write_samples(translated, translate_path, 'a')
    def generate_samples(self, write_path):
        """Generate a new samples file by replacing words in each reference.

        New reference samples are produced by substituting words of the
        original reference (word replacement via ``self.replace``).

        Args:
            write_path (str): Path of the file the new samples are appended to.
        """
        replaced = []
        count = 0
        for sample, token_list, doc in zip(self.samples, self.refs,
                                           self.corpus):
            # Keep the original source, pair it with a word-replaced reference.
            replaced.append(
                sample.split('<sep>')[0] + ' <sep> ' +
                self.replace(token_list, doc))
            count += 1
            # Flush to disk in batches of 100 to bound memory usage.
            if count % 100 == 0:
                print(count)
                write_samples(replaced, write_path, 'a')
                replaced = []
        # BUG FIX: write the final partial batch, which the original dropped.
        if replaced:
            write_samples(replaced, write_path, 'a')
Exemplo n.º 7
0
from data_utils import write_samples, partition

abs_path = pathlib.Path(__file__).parent.absolute()
# BUG FIX: sys.path.append returns None, so the original nested call
# `sys.path.append(sys.path.append(abs_path))` appended None to sys.path.
sys.path.append(str(abs_path))
curPath = os.path.abspath(os.path.dirname(__file__)) + '/'

# Build deduplicated "source<sep>reference" samples from the product JSON.
samples = set()
# Read json file.
json_path = os.path.join(abs_path, '../files/服饰_50k.json')
with open(json_path, 'r', encoding='utf8') as file:
    jsf = json.load(file)

for jsobj in jsf.values():
    title = jsobj['title'] + ' '  # Get title.
    kb = dict(jsobj['kb']).items()  # Get attributes.
    kb_merged = ''
    for key, val in kb:
        kb_merged += key + ' ' + val + ' '  # Merge attributes.

    ocr = ' '.join(list(jieba.cut(jsobj['ocr'])))  # Segment OCR text.
    texts = []
    texts.append(title + ocr + kb_merged)  # Merge them into one source text.
    reference = ' '.join(list(jieba.cut(jsobj['reference'])))
    for text in texts:
        sample = text + '<sep>' + reference  # Separate source and reference.
        samples.add(sample)
write_path = os.path.join(abs_path, '../files/samples.txt')
write_samples(samples, write_path)
partition(samples)
Exemplo n.º 8
0
    for line in train:
        if len(line.split("\t")) != 4: continue
        line = line.replace('\n', '').replace('\r', '')
        target = ' '.join(list(jieba.cut(line.split("\t")[1])))
        text = ' '.join(list(jieba.cut(line.split("\t")[2])))
        if line.split("\t")[3] == "FAVOR":
            stance = 1
        elif line.split("\t")[3] == "AGAINST":
            stance = -1
        else:
            stance = 0
        # stance = line.split("\t")[3]
        t_sample = text+'\n'+target+'\n'+str(stance)
        train_samples.add(t_sample)
# Persist the accumulated, word-segmented training samples next to this
# script; train_write_path may be reused below -- keep the name stable.
train_write_path = os.path.join(abs_path, 'train-3000-seg.txt')
write_samples(train_samples, train_write_path)

# process test file
with open(dev_file_path, 'r', encoding='utf8') as dev:
    for line in dev:
        if len(line.split("\t")) != 4: continue
        line = line.replace('\n', '').replace('\r', '')
        target = ' '.join(list(jieba.cut(line.split("\t")[1])))
        text =' '.join(list(jieba.cut(line.split("\t")[2])))
        if line.split("\t")[3] == "FAVOR":
            stance = 1
        elif line.split("\t")[3] == "AGAINST":
            stance = -1
        else:
            stance = 0
        # stance = line.split("\t")[3]