def _read_file(cls, input_file):
    """Read a tab-separated token file and group its tokens into sentences.

    Blank lines are skipped. A line starting with ``IMGID:`` opens a new
    sentence and carries that sentence's integer image id. Every other line
    is expected to be ``word<TAB>tag``; the word is passed through
    ``preprocess_word`` before it is stored. Lines that do not split into
    exactly two tab-separated fields are logged and ignored.

    Args:
        input_file: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, each shaped ``[words, tags, img_id]``. Tokens
        that appear before the first ``IMGID:`` line are returned as
        ``[words, tags]`` without an image id. Sentences that collected no
        valid token are dropped.

    Raises:
        ValueError: If the text following ``IMGID:`` is not an integer.
    """
    sentences = []
    sentence = [[], []]  # [[words], [tags], img_id]

    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue

            if line.startswith("IMGID:"):
                # Flush the sentence collected so far, if it has any token.
                if sentence[0]:
                    sentences.append(sentence)

                img_id = int(line.replace("IMGID:", "").strip())
                # Always start from a fresh sentence. Reusing a token-less
                # one would stack this id behind the previous image's id
                # and leave the stale id in the img_id slot.
                sentence = [[], [], img_id]
                continue

            # Guard only the split: that is the failure the log message
            # describes. Errors raised by preprocess_word are not hidden.
            try:
                word, tag = stripped.split("\t")
            except ValueError:
                logger.info("\"{}\" cannot be splitted".format(
                    line.rstrip()))
                continue

            sentence[0].append(preprocess_word(word))
            sentence[1].append(tag)

    # Flush the last one
    if sentence[0]:
        sentences.append(sentence)

    return sentences
def main():
    """Command-line entry point: translate the words given as arguments.

    ``sys.argv[1]`` is handed to ``load_translate_mod`` and every following
    argument is joined with single spaces to form the text. The text goes
    through ``preprocess_word``, then through the callable returned by
    ``load_translate_mod``; the result is passed to ``str_encode`` and
    written to standard output.

    Exits with status 0 without doing anything when fewer than two
    command-line arguments are supplied.
    """
    args = sys.argv

    # Script name + module path + at least one word are required.
    if len(args) <= 2:
        sys.exit(0)

    module_path = args[1]
    words = args[2:]

    # Load the translation callable from the given module path.
    translator = load_translate_mod(module_path)

    # Run the translation on the preprocessed text.
    query = preprocess_word(' '.join(words))
    translated = translator(query)

    sys.stdout.write(str_encode(translated))
# dot only at the end and this word is absent # in the existed words set) text_data = " ".join([ word[:-1] + "\n" if len(word) > 2 and word[-1] == "." and "." not in word[:-1] and word not in existed_words else word for word in text_data.split() ]) text_data = re.sub("<s[^>]+>", "<s>", text_data) text_data = re.sub("<s>", "{", text_data) text_data = re.sub("</s>", "}", text_data) part_data = re.finditer( r"\{(.*?)\}", text_data, re.MULTILINE | re.DOTALL) # take the internal of {...} for lines in part_data: lines = lines.group(1).strip() lines = re.sub(" +", " ", lines) for line in lines.split("\n"): sentence = [] for raw_word in line.split(" "): word = preprocess_word(raw_word) if len(word) > 0: sentence.append(word) if len(sentence) > 0: f_lm.write(" ".join(sentence) + "\n") print("Done!", flush=True)