def __init__(self):
    """Run the TextUtils initialiser and attach five empty lookup tables.

    After construction the instance exposes ``firsts``, ``lasts``,
    ``unigram``, ``bigram`` and ``trigram``, each bound to its own empty
    dict.
    """
    TextUtils.__init__(self)

    # Every attribute receives a distinct dict object so that filling one
    # table never affects another.
    # NOTE(review): presumably first/last-token and n-gram tables -- confirm
    # against the code that populates them.
    self.firsts, self.lasts = {}, {}
    self.unigram, self.bigram, self.trigram = {}, {}, {}
def preprocess_data(input_list):
    """Turn an iterable of lines into a list of Instance objects.

    Each item is passed through ``TextUtils.remove_blank`` and then
    ``TextUtils.tokenize``; the resulting token sequence is wrapped in an
    ``Instance`` created with ``tag=-1``.

    Args:
        input_list: Iterable of lines to convert.

    Returns:
        A new list holding one ``Instance`` per input item, in input order.
        An empty iterable yields an empty list.
    """
    # NOTE(review): tag=-1 presumably marks an instance without a label yet
    # -- confirm against the Instance definition.
    return [
        Instance(TextUtils.tokenize(TextUtils.remove_blank(raw_line)), tag=-1)
        for raw_line in input_list
    ]
# https://www.gairuo.com/file/data/dataset/GDP-China.csv
# NOTE(review): the frame below is built inline and nothing here reads the
# URL above -- confirm the link is still relevant.

# Two-row demo frame whose "address" and "contact" cells hold nested dicts,
# plus a Decimal column written with leading and trailing zeros.
df = pd.DataFrame(
    {
        "id": [1, 2],
        "name": ["Kevin", "Jenny"],
        "address": [
            {"hometown": "Meizhou", "work": "Guangzhou"},
            {"hometown": "Hangzhou", "work": "Guangzhou"},
        ],
        "contact": [
            {"mobile": ["+86 16888", "+86 168888"], "mail": "*****@*****.**"},
            {"mobile": ["+86 16666", "+86 166666"], "mail": "*****@*****.**"},
        ],
        "magic": [Decimal("000001.10000010"), Decimal("000002.20000020")],
    }
)


def _flatten_contact(contact):
    """Flatten one nested contact dict with TextUtils.flatten_dict."""
    # A new PrimitiveKVFormatter is created for every cell; array_index_start=1
    # is forwarded unchanged to flatten_dict.
    return TextUtils.flatten_dict(
        contact,
        formatter=PrimitiveKVFormatter(),
        array_index_start=1,
    )


# Replace each nested contact dict with its flattened form.
df["contact"] = df["contact"].apply(_flatten_contact)

# Temporarily lift pandas' display limits so the whole frame is printed
# without line wrapping, row truncation or column-width truncation.
with pd.option_context(
    "expand_frame_repr", False,
    "display.max_rows", None,
    "display.max_colwidth", None,
):
    print(df)
sys.exit(1) text_font_size = lpt.get_mfs() all_sizes = lpt.get_font_sizes() heading_size = all_sizes[all_sizes.index(text_font_size)+1] paper = etree.Element("paper") curSection = None mainBodyStarted = False for c in lpt.chunks: chunk_font_size = c.get_mf_attr("font-size") chunk_font_style = c.get_mf_attr("font-style") chunkText = " ".join([i.text for i in c.words]) chunkText = TextUtils.fix_wide_letters(TextUtils.remove_hyphens(chunkText)) # print " ".join([i.text for i in c.words]) if chunk_font_size >= text_font_size and chunk_font_size <= heading_size and chunk_font_style == FontStyle.Bold: if not mainBodyStarted and re.match("abstract", chunkText, flags = re.IGNORECASE): mainBodyStarted = True if not mainBodyStarted: continue curSection = etree.SubElement(paper, "section") m = re.match("^((?:\d\.)*\d)\s+(.*)$", chunkText) if m: curSection.attrib["number"] = m.group(1) chunkText = m.group(2) curSection.attrib["name"] = chunkText curSection.text = ""