def dfs_get(node, arg, q, ad, flag):
    """Walk the children of ``node`` and accumulate pattern slots into ``q``.

    Children are looked up in ``arg['tree']`` through the indices stored in
    ``node['c']``.  For every child, in order:

    * a child whose ``x['tag']`` equals ``arg['tag']`` is either expanded
      (recursion for inner nodes, token lookup for leaves) while ``arg`` has
      no ``'ansm'`` entry yet, or replaced by the stored ``arg['ansm']`` /
      ``arg['ansp']`` once it has one;
    * several tag combinations are skipped, recursed into with
      ``ad='VBG'``, or merely recorded in ``flag['advp']``;
    * any remaining child is passed on to ``modeget``.

    The walk ends early as soon as ``q['ch']['g']`` becomes positive.
    ``q`` and ``flag`` are mutated in place; nothing is returned.
    """
    common_tag = arg['common']['e']
    nodes = arg['tree']
    tokens = arg['tk']
    nominal = ('NN', 'NP')
    adverbial = ('ADVP', 'RB', 'RBR')

    for child_idx in node['c']:
        child = nodes[child_idx]
        if child['x'].get('tag', -1) == arg['tag']:
            # Child carries the current tag (the "common node" side).
            if 'ansm' in arg:
                answer = arg['ansm']
                offset = len(q['mod'])
                q['pos'].extend([offset + i for i in arg['ansp']])
                q['mod'].extend(answer)
                if child['e'] == 'PP':
                    # Let the recursion add words, but keep 'mod'/'pos'
                    # exactly as they were before descending.
                    saved_mod = q['mod'][:]
                    saved_pos = q['pos'][:]
                    dfs_get(child, arg, q, '', flag)
                    q['mod'] = saved_mod
                    q['pos'] = saved_pos
                else:
                    q['word'].extend(answer)
            elif len(child['c']) > 0:
                dfs_get(child, arg, q, ad, flag)
            else:
                q['pos'].append(len(q['mod']))
                entry = tokens[child['x']['r']][1]
                q['mod'].append(entry)
                q['word'].append(entry)
        elif getq(common_tag) in nominal and child['e'] == 'VP':
            q['ch']['v'] &= not checkbe(node, arg)
        elif common_tag in adverbial and (
                getq(child['e']) not in ('VB', 'VP', 'JJ')
                and child['e'] != 'ADJP'):
            pass
        elif child['e'] == 'PP':
            # PP extend
            dfs_get(child, arg, q, 'VBG', flag)
        elif (child['e'] == 'S' and len(child['c']) == 1
                and nodes[child['c'][0]]['e'] == "VP"):
            # S extend
            dfs_get(nodes[child['c'][0]], arg, q, 'VBG', flag)
        elif getq(common_tag) in nominal and child['e'] in ('S', 'SBAR'):
            # N S forbid
            pass
        elif child['e'] in adverbial:
            if common_tag not in adverbial:
                flag['advp'] = True
        else:
            modeget(child, arg, q, ad)

        if q['ch']['g'] > 0:
            break
def dfs_get(node, arg, q, ad, flag):
    """Walk ``node.children`` and accumulate pattern slots into ``q``.

    For every child, in order:

    * a child whose ``extra['tag']`` equals ``arg['tag']`` is either
      expanded (recursion for inner nodes, ``extra['rep']['l']`` for leaves)
      while ``arg`` has no ``'ansm'`` entry yet, or replaced by the stored
      ``arg['ansm']`` / ``arg['ansp']`` once it has one;
    * several tag combinations are skipped, recursed into with
      ``ad='VBG'``, or merely recorded in ``flag['advp']``;
    * any remaining child is passed on to ``modeget``.

    The walk ends early as soon as ``q['ch']['g']`` becomes positive.
    ``q`` and ``flag`` are mutated in place; nothing is returned.
    """
    common_tag = arg['common'].elem
    nominal = ('NN', 'NP')
    adverbial = ('ADVP', 'RB', 'RBR')

    for child in node.children:
        if child.extra['tag'] == arg['tag']:
            # Child carries the current tag (the "common node" side).
            if 'ansm' in arg:
                answer = arg['ansm']
                offset = len(q['mod'])
                q['pos'].extend([offset + i for i in arg['ansp']])
                q['mod'].extend(answer)
                if child.elem == 'PP':
                    # Let the recursion add words, but keep 'mod'/'pos'
                    # exactly as they were before descending.
                    saved_mod = q['mod'][:]
                    saved_pos = q['pos'][:]
                    dfs_get(child, arg, q, '', flag)
                    q['mod'] = saved_mod
                    q['pos'] = saved_pos
                else:
                    q['word'].extend(answer)
            elif len(child.children) > 0:
                dfs_get(child, arg, q, ad, flag)
            else:
                q['pos'].append(len(q['mod']))
                entry = child.extra['rep']['l']
                q['mod'].append(entry)
                q['word'].append(entry)
        elif getq(common_tag) in nominal and child.elem == 'VP':
            # Look at the first grandchild of the parent's first child; when
            # it is a leaf whose token lemma is 'be', clear the 'v' flag.
            lead = node.parent.children[0].children
            if len(lead[0].children) == 0:
                lemma = arg['tk'][int(lead[0].elem)]['l']
                q['ch']['v'] &= (lemma != 'be')
        elif common_tag in adverbial and (
                getq(child.elem) not in ('VB', 'VP', 'JJ')
                and child.elem != 'ADJP'):
            pass
        elif child.elem == 'PP':
            # PP extend
            dfs_get(child, arg, q, 'VBG', flag)
        elif (child.elem == 'S' and len(child.children) == 1
                and child.children[0].elem == "VP"):
            # S extend
            dfs_get(child.children[0], arg, q, 'VBG', flag)
        elif getq(common_tag) in nominal and child.elem in ('S', 'SBAR'):
            # N S forbid
            pass
        elif child.elem in adverbial:
            if common_tag not in adverbial:
                flag['advp'] = True
        else:
            modeget(child, arg, q, ad)

        if q['ch']['g'] > 0:
            break
def dfs_get(node, arg, q, ad, flag):
    """Walk the children of ``node`` and accumulate pattern slots into ``q``.

    Children are looked up in ``arg['tree']`` through the indices stored in
    ``node['c']``.  For every child, in order:

    * a child whose ``x['tag']`` equals ``arg['tag']`` is either expanded
      (recursion for inner nodes, token lookup for leaves) while ``arg`` has
      no ``'ansm'`` entry yet, or replaced by the stored ``arg['ansm']`` /
      ``arg['ansp']`` once it has one;
    * several tag combinations are skipped, recursed into with
      ``ad='VBG'``, or merely recorded in ``flag['advp']``;
    * any remaining child is passed on to ``modeget``.

    The walk ends early as soon as ``q['ch']['g']`` becomes positive.
    ``q`` and ``flag`` are mutated in place; nothing is returned.
    """
    common_tag = arg['common']['e']
    nodes = arg['tree']
    tokens = arg['tk']
    nominal = ('NN', 'NP')
    adverbial = ('ADVP', 'RB', 'RBR')

    for child_idx in node['c']:
        child = nodes[child_idx]
        if child['x'].get('tag', -1) == arg['tag']:
            # Child carries the current tag (the "common node" side).
            if 'ansm' in arg:
                answer = arg['ansm']
                offset = len(q['mod'])
                q['pos'].extend([offset + i for i in arg['ansp']])
                q['mod'].extend(answer)
                if child['e'] == 'PP':
                    # Let the recursion add words, but keep 'mod'/'pos'
                    # exactly as they were before descending.
                    saved_mod = q['mod'][:]
                    saved_pos = q['pos'][:]
                    dfs_get(child, arg, q, '', flag)
                    q['mod'] = saved_mod
                    q['pos'] = saved_pos
                else:
                    q['word'].extend(answer)
            elif len(child['c']) > 0:
                dfs_get(child, arg, q, ad, flag)
            else:
                q['pos'].append(len(q['mod']))
                entry = tokens[child['x']['r']][1]
                q['mod'].append(entry)
                q['word'].append(entry)
        elif getq(common_tag) in nominal and child['e'] == 'VP':
            q['ch']['v'] &= not checkbe(node, arg)
        elif common_tag in adverbial and (
                getq(child['e']) not in ('VB', 'VP', 'JJ')
                and child['e'] != 'ADJP'):
            pass
        elif child['e'] == 'PP':
            # PP extend
            dfs_get(child, arg, q, 'VBG', flag)
        elif (child['e'] == 'S' and len(child['c']) == 1
                and nodes[child['c'][0]]['e'] == "VP"):
            # S extend
            dfs_get(nodes[child['c'][0]], arg, q, 'VBG', flag)
        elif getq(common_tag) in nominal and child['e'] in ('S', 'SBAR'):
            # N S forbid
            pass
        elif child['e'] in adverbial:
            if common_tag not in adverbial:
                flag['advp'] = True
        else:
            modeget(child, arg, q, ad)

        if q['ch']['g'] > 0:
            break
def modeget(child, arg, c, ad=False):
    """Return the pattern fragment for ``child`` and update the counters ``c``.

    ``c`` is a mutable sequence updated in place:

    * ``c[0]`` is and-ed with "tag starts with a letter" and with
      "tag is neither ``MD`` nor ``CC``";
    * ``c[1]`` is incremented, then decremented again for ``DT`` / ``PRP$``
      nodes and for a single-leaf node whose lemma is ``'have'``;
    * ``c[2]`` gets bit 0 set when ``ad`` is truthy and the tag is ``VBG``
      or ``VBD``.

    Returns a string that is either empty or ends with one space.
    """
    tag = child.elem
    c[0] &= tag[0].isalpha()
    c[0] &= tag not in ('MD', 'CC')
    c[1] += 1

    if tag in ('IN', 'TO'):
        # Prepositions / "to" contribute their own lemma.
        return arg['tk'][int(child.children[0].elem)]['l'] + ' '
    if tag in ('DT', 'PRP$'):
        c[1] -= 1
        return ''
    if ad and tag in ('VBG', 'VBD'):
        c[2] |= 1
        return tag + ' '
    if tag == 'SBAR':
        return 'S '
    if len(child.children) == 1 and len(child.children[0].children) == 0:
        # Pre-terminal: decide on the lemma of its only leaf.
        lemma = arg['tk'][int(child.children[0].elem)]['l']
        if lemma == 'be':
            return lemma + ' '
        if lemma == 'have':
            c[1] -= 1
            return ''
        return getq(tag) + ' '
    return tag + ' '
def get_qtree_db(tree, tokens, key, ctype):
    """Search the collection ``cl`` for sentences matching the query tree.

    The key indices whose token passes ``not is_upper(...)`` are turned into
    a ``$all`` query, longest lemma first: plain lemmas against
    ``tokens.l`` when ``ctype == 2``, otherwise ``$elemMatch`` clauses on
    lemma (``l``) and ``getq`` of the POS tag (``q``).  Every returned
    document is checked with ``check_find``; truthy matches are clustered
    through ``addCluster`` and, when it returns a truthy flag, the marked-up
    sentence is appended to the result list.

    Returns the dict ``{'result': [...], 'desc': {'sen': [...]}}`` after the
    cluster list has been sorted and ``result_part`` has been applied to it.
    """
    lookup_keys = [k for k in key if not is_upper(tokens[k])]

    if ctype == 2:
        wanted = sorted((tokens[k]['lemma'] for k in lookup_keys),
                        key=len, reverse=True)
        cursor = cl.find({'tokens.l': {'$all': wanted}})
    else:
        wanted = sorted(
            ({'$elemMatch': {'l': tokens[k]['lemma'],
                             'q': getq(tokens[k]['pos'])}}
             for k in lookup_keys),
            key=lambda clause: len(clause['$elemMatch']['l']),
            reverse=True)
        cursor = cl.find({'tokens': {'$all': wanted}})

    retJson = {'result': [], 'desc': {'sen': []}}
    cluster_index = {}
    clusters = retJson['desc']['sen']
    hits = retJson['result']

    for doc in cursor:
        stored_tree = doc['tree0']
        stored_tokens = doc['tokens']
        candidate = transfer_Node_i(stored_tree)
        match = check_find(tree, key, tokens, candidate, stored_tokens, ctype)
        if not match:
            continue
        cluster_id, keep = addCluster(cluster_index, clusters,
                                      match.resultSent2, {'len': match.cost})
        if keep:
            marked = cleaned_sentence([w['t'] for w in stored_tokens],
                                      match.qkey)
            hits.append({'sentence': marked, 'sen': cluster_id})

    def rank(entry):
        # '_others_' always gets key 0; everything else is ordered by
        # descending count, then ascending length.
        if entry['title'] != '_others_':
            return -entry['count'] * 100 + entry['len']
        return 0

    clusters.sort(key=rank)
    result_part(retJson)
    return retJson
def get_qtree_db(tree, tokens, key, ctype):
    """Search the collection ``cl`` for sentences matching the query tree.

    The key indices whose token passes ``not is_upper(...)`` are turned into
    a ``$all`` query, longest lemma first: plain lemmas against
    ``tokens.l`` when ``ctype == 2``, otherwise ``$elemMatch`` clauses on
    lemma (``l``) and ``getq`` of the POS tag (``q``).  Every returned
    document is checked with ``check_find``; truthy matches are clustered
    through ``addCluster`` and, when it returns a truthy flag, the marked-up
    sentence is appended to the result list.

    Returns the dict ``{'result': [...], 'desc': {'sen': [...]}}`` after the
    cluster list has been sorted and ``result_part`` has been applied to it.
    """
    lookup_keys = [k for k in key if not is_upper(tokens[k])]

    if ctype == 2:
        wanted = sorted((tokens[k]['lemma'] for k in lookup_keys),
                        key=len, reverse=True)
        cursor = cl.find({'tokens.l': {'$all': wanted}})
    else:
        wanted = sorted(
            ({'$elemMatch': {'l': tokens[k]['lemma'],
                             'q': getq(tokens[k]['pos'])}}
             for k in lookup_keys),
            key=lambda clause: len(clause['$elemMatch']['l']),
            reverse=True)
        cursor = cl.find({'tokens': {'$all': wanted}})

    retJson = {'result': [], 'desc': {'sen': []}}
    cluster_index = {}
    clusters = retJson['desc']['sen']
    hits = retJson['result']

    for doc in cursor:
        stored_tree = doc['tree0']
        stored_tokens = doc['tokens']
        candidate = transfer_Node_i(stored_tree)
        match = check_find(tree, key, tokens, candidate, stored_tokens, ctype)
        if not match:
            continue
        cluster_id, keep = addCluster(cluster_index, clusters,
                                      match.resultSent2, {'len': match.cost})
        if keep:
            marked = cleaned_sentence([w['t'] for w in stored_tokens],
                                      match.qkey)
            hits.append({'sentence': marked, 'sen': cluster_id})

    def rank(entry):
        # '_others_' always gets key 0; everything else is ordered by
        # descending count, then ascending length.
        if entry['title'] != '_others_':
            return -entry['count'] * 100 + entry['len']
        return 0

    clusters.sort(key=rank)
    result_part(retJson)
    return retJson
def comnex_add(node, arg):
    """Build the pattern for ``node`` around ``arg['common']`` and record it.

    A fresh accumulator ``q`` is prepared (optionally seeded with ``'be'``),
    filled by ``dfs_get`` and handed to ``addResult`` when its ``'v'`` flag
    is set and its ``'w'`` counter is positive, or when ``node`` is the
    common node and ``arg['keylm']`` has more than one entry.  When ``node``
    equals ``arg['common']`` the collected lists are also stored on ``arg``
    as ``'ansm'``, ``'answ'`` and ``'ansp'``.  If ``dfs_get`` flagged an
    adverbial child, a second adverb-only pattern may be recorded as well.

    Returns 0 or 1 on the early exits (S/SBAR with an adjective child, PP),
    otherwise 0 when any of the passive / keep-tense / break flags is set
    and 100 when none is.  NOTE(review): the meaning of these codes is
    defined by the caller, which is not visible here.
    """
    def blank():
        return {'mod': [], 'word': [], 'pos': [],
                'ch': {'v': True, 'w': 0, 'g': 0, 'sn': 1, 'sj': 1, 'sv': 1}}

    q = blank()
    common_tag = arg['common'].elem
    ad = ''
    passive = keep_tense = stop_later = False
    child_tags = [kid.elem for kid in node.children]

    if node.parent.elem == 'VP' and node.elem == 'VP':
        lead = node.parent.children[0].children
        form = arg['common'].extra['rep']['p']
        if len(lead[0].children) == 0:
            aux = arg['tk'][int(lead[0].elem)]['l']
            if form == 'VBN' and aux == 'be':
                # passive
                q['mod'].append('be')
                q['word'].append('be')
                passive = True
            if ((form == 'VBN' and aux == 'have')
                    or (form == 'VBG' and aux == 'be')
                    or (form == 'VB'
                        and aux in ('will', 'would', 'to', 'do'))):
                # break later
                stop_later = True

    if node.elem in ('S', 'SBAR') and (
            'ADJP' in child_tags or 'JJ' in child_tags):
        # make it clear (that)
        return 0

    if common_tag in ('ADJP', 'JJ'):
        q['ch']['sj'] -= 1

    if node.elem == 'NP':
        if getq(common_tag) in ('NN', 'NP'):
            # complicate NP
            q['ch']['sn'] -= 1
            ad = 'N V'
        if getq(common_tag) in ('VB', 'VP'):
            # keep the tense
            keep_tense = True

    if (node.elem == 'ADJP' and common_tag in ('ADJP', 'JJ')
            and node.parent.elem == 'VP'):
        lead = node.parent.children[0].children
        if len(lead[0].children) == 0:
            aux = arg['tk'][int(lead[0].elem)]['l']
            if aux == 'be':
                # be clear that
                q['mod'].append('be')
                q['word'].append('be')

    if node.elem == 'PP':
        # search for key
        return 1 if getq(common_tag) in ('NN', 'NP') else 0

    if node.elem == 'VP' and getq(common_tag) in ('NN', 'NP'):
        # cut down trees
        ad = 'V N'

    flag = {'edvm': passive or keep_tense, 'advp': False}
    dfs_get(node, arg, q, ad, flag)

    if node == arg['common']:
        arg['ansm'] = q['mod']
        arg['answ'] = q['word']
        arg['ansp'] = q['pos']

    if (q['ch']['v'] and q['ch']['w'] > 0) or (
            node == arg['common'] and len(arg['keylm']) > 1):
        addResult(arg, q)

    if (not passive and not keep_tense and flag['advp']
            and (common_tag in ('VP', 'ADJP')
                 or getq(common_tag) in ('VB', 'JJ'))):
        # Second pass: stored answer plus adverbial children only.
        q = blank()
        for kid in node.children:
            if kid.extra['tag'] == arg['tag']:
                answer = arg['ansm']
                offset = len(q['mod'])
                q['pos'].extend([offset + i for i in arg['ansp']])
                q['mod'].extend(answer)
                q['word'].extend(answer)
            elif kid.elem in ('ADVP', 'RB', 'RBR'):
                modeget(kid, arg, q)
        if q['ch']['v'] and q['ch']['w'] > 0:
            addResult(arg, q)

    return 0 if (passive or keep_tense or stop_later) else 100
def dfs_root_tree(tree, arg):
    """Reset tags and compute ``extra['rep']`` for ``tree`` and its subtree.

    Works bottom-up: children are processed first.  Every node gets
    ``extra['tag'] = -1`` and an ``extra['rep']`` dict that starts as a copy
    of

    * the token record ``arg['tk'][int(elem)]`` for a leaf,
    * the first child's ``rep`` when that child is a leaf,
    * ``niltk`` otherwise,

    and is then refined per child according to the node's own tag (NP, VP,
    ADJP, ADVP, PP, PRT, or a single-child node).  When the last child is
    tagged ``POS`` the node itself is retagged ``PRP$`` and its ``rep`` is
    reset to a copy of ``niltk``.
    """
    tree.extra['tag'] = -1
    for sub in tree.children:
        dfs_root_tree(sub, arg)

    kids = tree.children
    if len(kids) == 0:
        tree.extra['rep'] = dict(arg['tk'][int(tree.elem)])
    elif len(kids[0].children) == 0:
        tree.extra['rep'] = dict(kids[0].extra['rep'])
    else:
        tree.extra['rep'] = dict(niltk)

    info = tree.extra
    own_tag = tree.elem  # captured before any retagging below
    tokens = arg['tk']

    # State of the auxiliary seen so far inside a VP:
    # 0 none, 1 'to', 2 'have', 3 'be', 4 'do', 5 'will'/'would'.
    aux = 0
    aux_codes = {'have': 2, 'be': 3, 'do': 4}
    # POS tags of a nested VP that are accepted after each auxiliary.
    accepted_after = {2: ('VBN',), 3: ('VBN', 'VBG'), 4: ('VBP',), 5: ('VBP',)}

    for sub in kids:
        sub_rep = dict(sub.extra['rep'])
        if own_tag == 'NP':
            if getq(sub.elem) in ('NN', 'NP'):
                info['rep'] = sub_rep
            elif sub.elem == 'PP':
                head = sub.children[0]
                if head.elem == 'IN':
                    prep = tokens[int(head.children[0].elem)]['l']
                    if prep == 'of':
                        info['rep'] = dict(niltk)
        elif own_tag == 'VP':
            if sub.elem == 'TO':
                aux = 1
            elif getq(sub.elem) == 'VB':
                info['rep'] = sub_rep
                lemma = tokens[int(sub.children[0].elem)]['l']
                aux = aux_codes.get(lemma, aux)
            elif sub.elem == 'MD':
                lemma = tokens[int(sub.children[0].elem)]['l']
                if lemma in ('will', 'would'):
                    aux = 5
            elif sub.elem == 'VP':
                if aux == 0:
                    info['rep'] = sub_rep
                elif aux in accepted_after and sub_rep['p'] in accepted_after[aux]:
                    info['rep'] = sub_rep
                else:
                    info['rep'] = dict(niltk)
        elif own_tag == 'ADJP':
            if sub.elem in ('ADJP', 'JJ'):
                info['rep'] = sub_rep
        elif own_tag == 'ADVP':
            if sub.elem in ('ADVP', 'RB', 'RBR'):
                info['rep'] = sub_rep
        elif own_tag == 'PP':
            if getq(sub.elem) in ('VP', 'VB', 'NP', 'NN'):
                info['rep'] = sub_rep
        elif own_tag == 'PRT':
            info['rep']['l'] += sub_rep['l'] + ' '
        elif len(kids) == 1:
            info['rep'] = sub_rep

        # Checked on every iteration, so it can only run for nodes that
        # actually have children.
        if kids[-1].elem == 'POS':
            tree.elem = 'PRP$'
            info['rep'] = dict(niltk)
def modeget(child, arg, q, ad=''):
    """Classify one modifier ``child`` and append its slot to ``q``.

    Updates the counters in ``q['ch']`` in place: ``'v'`` is and-ed with two
    tag checks, ``'w'`` is incremented and taken back again whenever the
    child ends up contributing no slot, ``'sn'`` / ``'sj'`` / ``'sv'`` are
    decremented for noun, adjective and adverb slots, and bit 0 of ``'g'``
    is set for a ``VBG`` / ``VBN`` child when ``ad == 'VBG'``.

    Accepted children append a label to ``q['mod']`` and a matching entry to
    ``q['word']``.  When the slot index equals ``arg['nxt']`` the word entry
    is taken from the child's ``extra['rep']`` (falling back to
    ``'_other_'``) instead of the label.  Returns ``None``.
    """
    counters = q['ch']
    raw_tag = child.elem
    counters['v'] &= raw_tag[0].isalpha()
    counters['v'] &= raw_tag not in ('MD', 'CC')
    counters['w'] += 1

    e = raw_tag
    at_next = (len(q['mod']) == arg['nxt'])
    if at_next:
        shown = child.extra['rep']['l'] or '_other_'
    else:
        shown = '_other_'

    def emit(tag):
        # Append the display label for ``tag``; the word list gets the
        # representative word only at the ``nxt`` slot.
        label = disp.get(tag, tag)
        q['mod'].append(label)
        q['word'].append(shown if at_next else label)

    if e in ('CD', 'PDT', 'QP', 'PRN', 'DT', 'PRP$', 'POS'):
        counters['w'] -= 1
    elif ad == 'V N' and e == 'PRT':
        counters['w'] -= 1
        q['word'].append(child.extra['rep']['l'])
    elif ad == 'VBG' and e in ('VBG', 'VBN'):
        counters['g'] |= 1
        q['mod'].append(e)
        q['word'].append(child.extra['rep']['t'] if at_next else e)
    elif e == 'ADVP':
        counters['sv'] -= 1
        if counters['sv'] >= 0:
            emit(e)
        else:
            counters['w'] -= 1
    elif len(child.children) == 1 and len(child.children[0].children) == 0:
        # Pre-terminal: work with the lemma of its only leaf.
        lemma = arg['tk'][int(child.children[0].elem)]['l']
        e = getq(raw_tag)
        if e in ('IN', 'TO'):
            q['pos'].append(len(q['mod']))
            q['mod'].append(lemma)
            q['word'].append(lemma)
        elif e in ('NN', 'JJ'):
            if e == 'NN':
                counters['sn'] -= 1
            if e == 'JJ':
                counters['sj'] -= 1
                e = 'JJ'
            if counters['sn'] >= 0 and counters['sj'] >= 0:
                emit(e)
            else:
                counters['w'] -= 1
        elif ad == 'N V' and raw_tag in ('VBG', 'VBN'):
            # Participle inside an NP is counted as an adjective slot.
            counters['sj'] -= 1
            e = 'JJ'
            if counters['sj'] >= 0:
                label = disp.get(e, e)
                q['mod'].append(label)
                q['word'].append(
                    child.extra['rep']['t'] if at_next else label)
        elif e == 'RB':
            if lemma == 'not' or counters['sv'] <= 0:
                counters['w'] -= 1
            else:
                counters['sv'] -= 1
                emit(e)
        elif e == 'VB':
            if lemma in ('be', 'have'):
                counters['w'] -= 1
            else:
                emit(e)
        else:
            emit(e)
    else:
        if e == 'ADJP':
            counters['sj'] -= 1
        if counters['sj'] >= 0:
            emit(e)
        else:
            counters['w'] -= 1
def comnex_add(node, arg):
    """Build the pattern for ``node`` around ``arg['common']`` and record it.

    Nodes are dicts linked through indices into ``arg['tree']`` (``'p'`` for
    the parent, ``'c'`` for the children).  A fresh accumulator ``q`` is
    prepared (optionally seeded with ``'be'``), filled by ``dfs_get`` and
    handed to ``addResult`` when its ``'v'`` flag is set and its ``'w'``
    counter is positive, or when ``node`` is the common node and
    ``arg['keylm']`` has more than one entry.  When ``node`` equals
    ``arg['common']`` the collected lists are also stored on ``arg`` as
    ``'ansm'``, ``'answ'`` and ``'ansp'``.  If ``dfs_get`` flagged an
    adverbial child, a second adverb-only pattern may be recorded as well.

    Returns 0 or 1 on the early exits (S/SBAR with an adjective child, PP),
    otherwise 0 when any of the passive / keep-tense / break flags is set
    and 100 when none is.  NOTE(review): the meaning of these codes is
    defined by the caller, which is not visible here.
    """
    def blank():
        return {'mod': [], 'word': [], 'pos': [],
                'ch': {'v': True, 'w': 0, 'g': 0, 'sn': 1, 'sj': 1, 'sv': 1}}

    q = blank()
    common_tag = arg['common']['e']
    nodes = arg['tree']
    ad = ''
    passive = keep_tense = stop_later = False
    child_tags = [nodes[idx]['e'] for idx in node['c']]

    parent = nodes[node['p']]
    if parent['e'] == 'VP' and node['e'] == 'VP':
        # First grandchild reached through the parent's first child.
        first_sibling = nodes[parent['c'][0]]
        lead = nodes[first_sibling['c'][0]]
        if len(lead['c']) == 0:
            aux = arg['tk'][int(lead['e'])][1]
            if aux == 'be':
                # passive
                q['pos'].append(len(q['mod']))
                q['mod'].append('be')
                q['word'].append('be')
                passive = True
            if aux in ('have', 'be', 'will', 'would', 'to', 'do'):
                # break later
                stop_later = True

    if node['e'] in ('S', 'SBAR') and (
            'ADJP' in child_tags or 'JJ' in child_tags):
        # make it clear (that)
        return 0

    if common_tag in ('ADJP', 'JJ'):
        q['ch']['sj'] -= 1

    if node['e'] == 'NP':
        if getq(common_tag) in ('NN', 'NP'):
            # complicate NP
            q['ch']['sn'] -= 1
            ad = 'N V'
        if getq(common_tag) in ('VB', 'VP'):
            # keep the tense
            keep_tense = True

    if (node['e'] == 'ADJP' and common_tag in ('ADJP', 'JJ')
            and parent['e'] == 'VP'):
        if checkbe(node, arg):
            # be clear that
            q['pos'].append(len(q['mod']))
            q['mod'].append('be')
            q['word'].append('be')

    if node['e'] == 'PP':
        # search for key
        return 1 if getq(common_tag) in ('NN', 'NP') else 0

    if node['e'] == 'VP' and getq(common_tag) in ('NN', 'NP'):
        # cut down trees
        ad = 'V N'

    flag = {'edvm': passive or keep_tense, 'advp': False}
    dfs_get(node, arg, q, ad, flag)

    if node == arg['common']:
        arg['ansm'] = q['mod']
        arg['answ'] = q['word']
        arg['ansp'] = q['pos']

    if (q['ch']['v'] and q['ch']['w'] > 0) or (
            node == arg['common'] and len(arg['keylm']) > 1):
        addResult(arg, q)

    if (not passive and not keep_tense and flag['advp']
            and (common_tag in ('VP', 'ADJP')
                 or getq(common_tag) in ('VB', 'JJ'))):
        # Second pass: stored answer plus adverbial children only.
        q = blank()
        for idx in node['c']:
            kid = nodes[idx]
            if kid['x'].get('tag', -1) == arg['tag']:
                answer = arg['ansm']
                offset = len(q['mod'])
                q['pos'].extend([offset + i for i in arg['ansp']])
                q['mod'].extend(answer)
                q['word'].extend(answer)
            elif kid['e'] in ('ADVP', 'RB', 'RBR'):
                modeget(kid, arg, q)
        if q['ch']['v'] and q['ch']['w'] > 0:
            addResult(arg, q)

    return 0 if (passive or keep_tense or stop_later) else 100
def modeget(child, arg, q, ad=''):
    """Classify one modifier ``child`` and append its slot to ``q``.

    ``child`` is a node dict; its children are indices into ``arg['tree']``
    and ``child['x']['r']`` indexes the token table ``arg['tk']``.

    Updates the counters in ``q['ch']`` in place: ``'v'`` is and-ed with two
    tag checks, ``'w'`` is incremented and taken back again whenever the
    child ends up contributing no slot, ``'sn'`` / ``'sj'`` / ``'sv'`` are
    decremented for noun, adjective and adverb slots, and bit 0 of ``'g'``
    is set for a ``VBG`` / ``VBN`` child when ``ad == 'VBG'``.

    Accepted children append a label to ``q['mod']`` and a matching entry to
    ``q['word']``.  When the slot index equals ``arg['nxt']`` the word entry
    is taken from the child's token record (falling back to ``'_other_'``)
    instead of the label.  Returns ``None``.
    """
    tokens = arg['tk']
    counters = q['ch']
    raw_tag = child['e']
    counters['v'] &= raw_tag[0].isalpha()
    counters['v'] &= raw_tag not in ('MD', 'CC')
    counters['w'] += 1

    e = raw_tag
    at_next = (len(q['mod']) == arg['nxt'])
    if at_next:
        shown = tokens[child['x']['r']][1] or '_other_'
    else:
        shown = '_other_'

    def emit(tag):
        # Append the display label for ``tag``; the word list gets the
        # representative word only at the ``nxt`` slot.
        label = disp.get(tag, tag)
        q['mod'].append(label)
        q['word'].append(shown if at_next else label)

    if e in ('CD', 'PDT', 'QP', 'PRN', 'DT', 'PRP$', 'POS'):
        counters['w'] -= 1
    elif ad == 'V N' and e == 'PRT':
        counters['w'] -= 1
        q['word'].append(tokens[child['x']['r']][1])
    elif ad == 'VBG' and e in ('VBG', 'VBN'):
        counters['g'] |= 1
        q['mod'].append(e)
        q['word'].append(tokens[child['x']['r']][0] if at_next else e)
    elif e == 'ADVP':
        counters['sv'] -= 1
        if counters['sv'] >= 0:
            emit(e)
        else:
            counters['w'] -= 1
    elif (len(child['c']) == 1
            and len(arg['tree'][child['c'][0]]['c']) == 0):
        # Pre-terminal: work with the token entry of its only leaf.
        lemma = arg['tk'][int(arg['tree'][child['c'][0]]['e'])][1]
        e = getq(raw_tag)
        if e in ('IN', 'TO'):
            q['pos'].append(len(q['mod']))
            q['mod'].append(lemma)
            q['word'].append(lemma)
        elif e in ('NN', 'JJ'):
            if e == 'NN':
                counters['sn'] -= 1
            if e == 'JJ':
                counters['sj'] -= 1
                e = 'JJ'
            if counters['sn'] >= 0 and counters['sj'] >= 0:
                emit(e)
            else:
                counters['w'] -= 1
        elif ad == 'N V' and raw_tag in ('VBG', 'VBN'):
            # Participle inside an NP is counted as an adjective slot.
            counters['sj'] -= 1
            e = 'JJ'
            if counters['sj'] >= 0:
                label = disp.get(e, e)
                q['mod'].append(label)
                q['word'].append(
                    tokens[child['x']['r']][0] if at_next else label)
        elif e == 'RB':
            if lemma == 'not' or counters['sv'] <= 0:
                counters['w'] -= 1
            else:
                counters['sv'] -= 1
                emit(e)
        elif e == 'VB':
            if lemma in ('be', 'have'):
                counters['w'] -= 1
            else:
                emit(e)
        else:
            emit(e)
    else:
        if e == 'ADJP':
            counters['sj'] -= 1
        if counters['sj'] >= 0:
            emit(e)
        else:
            counters['w'] -= 1
def checksuit(strc, strp):
    """Tell whether tag ``strc`` may stand in for tag ``strp``.

    True when both are equal; otherwise the pair ``(getq(strc), strp)`` must
    be a member of ``validpass``.
    """
    if strc == strp:
        return True
    return (getq(strc), strp) in validpass