Пример #1
0
    def _read_file(cls, input_file):
        """Read a tab-separated file and group its tokens into sentences.

        The file is processed line by line:

        * blank lines are skipped;
        * a line starting with ``IMGID:`` closes the sentence collected so
          far and supplies the integer image id of the next one;
        * any other line must be ``word<TAB>tag``. The word is passed
          through ``preprocess_word`` before it is stored. A line that does
          not split into exactly two fields is logged and skipped.

        Args:
            input_file: Path of the UTF-8 encoded file to read.

        Returns:
            A list of sentences, each of the form ``[words, tags, img_id]``.
            Tokens that appear before the first ``IMGID:`` line form a
            sentence of the form ``[words, tags]`` (no image id).

        Raises:
            ValueError: If the text following ``IMGID:`` is not an integer.
        """
        sentences = []
        # [[words], [tags]]; an img_id is added once an IMGID line is seen.
        sentence = [[], []]

        with open(input_file, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue

                if line.startswith("IMGID:"):
                    # Flush the sentence collected so far (if any).
                    if sentence[0]:
                        sentences.append(sentence)

                    img_id = int(line.replace("IMGID:", "").strip())
                    # Always start from a fresh record so that an IMGID
                    # line following an empty sentence replaces the stale
                    # id instead of stacking a second id onto the record.
                    sentence = [[], [], img_id]
                    continue

                # Only the unpacking is guarded: a wrong field count raises
                # ValueError. Anything else (e.g. KeyboardInterrupt or an
                # error inside preprocess_word) is no longer swallowed.
                try:
                    word, tag = line.strip().split("\t")
                except ValueError:
                    logger.info("\"{}\" cannot be splitted".format(
                        line.rstrip()))
                    continue

                sentence[0].append(preprocess_word(word))
                sentence[1].append(tag)

        # Flush the last sentence.
        if sentence[0]:
            sentences.append(sentence)

        return sentences
Пример #2
0
def main():
    """Translate the words given on the command line and print the result.

    Expects ``sys.argv`` to hold at least three items: the program name, the
    argument handed to ``load_translate_mod``, and one or more words. With
    fewer arguments the process exits with status 0 without doing anything.
    """
    args = sys.argv
    if len(args) < 3:
        sys.exit(0)

    module_arg, words = args[1], args[2:]

    # Obtain the translate callable for the requested module.
    translate = load_translate_mod(module_arg)

    # Join the remaining arguments into one query, normalise it, translate
    # it, and write the encoded result to stdout.
    query = preprocess_word(' '.join(words))
    res = translate(query)
    sys.stdout.write(str_encode(res))
Пример #3
0
                    # dot only at the end and this word is absent
                    # in the existed words set)
                    text_data = " ".join([
                        word[:-1] + "\n" if len(word) > 2 and word[-1] == "."
                        and "." not in word[:-1] and word not in existed_words
                        else word for word in text_data.split()
                    ])

                    text_data = re.sub("<s[^>]+>", "<s>", text_data)
                    text_data = re.sub("<s>", "{", text_data)
                    text_data = re.sub("</s>", "}", text_data)

                    part_data = re.finditer(
                        r"\{(.*?)\}", text_data,
                        re.MULTILINE | re.DOTALL)  # take the internal of {...}

                    for lines in part_data:
                        lines = lines.group(1).strip()
                        lines = re.sub(" +", " ", lines)

                        for line in lines.split("\n"):
                            sentence = []
                            for raw_word in line.split(" "):
                                word = preprocess_word(raw_word)
                                if len(word) > 0:
                                    sentence.append(word)
                            if len(sentence) > 0:
                                f_lm.write(" ".join(sentence) + "\n")

    print("Done!", flush=True)