### 3.4.2 CoreNLP Parser

```python
from nltk.parse.corenlp import CoreNLPParser


def _parse(sentences):
    """Parse a list of raw sentences with a running CoreNLP server."""
    parser = CoreNLPParser(url='http://localhost:9000')
    # Single-sentence example:
    # parse, = parser.raw_parse(
    #     'The quick brown fox jumps over the lazy dog.'
    # )
    # parse.pretty_print()
    parse = parser.raw_parse_sents(sentences)
    # raw_parse_sents yields one tree iterator per input sentence;
    # flatten the nested iterators into a single list of parse trees.
    parse_trees = []
    for itr_tree in parse:
        for tree in itr_tree:
            parse_trees.append(tree)
    return parse_trees
```
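The helper can be sanity-checked against a locally running server. The sketch below is illustrative only: it assumes a CoreNLP server has already been started on port 9000 (for example with `java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000`), and the sample sentences are made up.

```python
# Usage sketch; assumes a CoreNLP server is listening on localhost:9000.
sentences = [
    'The quick brown fox jumps over the lazy dog.',
    'Colorless green ideas sleep furiously.',
]
for tree in _parse(sentences):
    tree.pretty_print()  # render each constituency tree as ASCII art
```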
```python
import googletrans
from nltk.parse.corenlp import CoreNLPParser, CoreNLPServer


# Constructor of the parsing class (the class statement itself is not part
# of this excerpt).
def __init__(self, sentence):
    """Translate `sentence` into English and parse the result with CoreNLP."""
    # Run a local CoreNLP server for the duration of this block; the CoreNLP
    # jars must be discoverable (e.g. via the CLASSPATH environment variable).
    with CoreNLPServer(port=9000) as server:
        en_parser = CoreNLPParser()
        self.trans = googletrans.Translator()
        self.sentence = sentence
        result1 = self.trans.translate(sentence).text
        print(result1)
        # raw_parse_sents yields one tree iterator per input sentence;
        # materialize each of them into a list.
        parse_iter = en_parser.raw_parse_sents([result1])
        tree = [list(sub_tree) for sub_tree in parse_iter]
        print(len(tree))
        self.tree = tree[0][0]  # first tree of the first (only) sentence
        self.rel = []
```
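A hypothetical call site might look like the following; the class name `TranslatedSentence` is an assumption (the excerpt above shows only the constructor), and the Spanish input sentence is illustrative.

```python
# Hypothetical usage; `TranslatedSentence` stands in for the enclosing class,
# whose actual name is not shown in the excerpt above.
ts = TranslatedSentence('El rápido zorro marrón salta sobre el perro perezoso.')
ts.tree.pretty_print()  # constituency tree of the English translation
print(ts.rel)           # relation list, initialized empty by the constructor
```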
```python
import nltk
import numpy as np
from nltk.parse.corenlp import CoreNLPParser
from nltk.tree import ParentedTree
from tqdm import tqdm
from zss import Node, simple_distance

numnodes = 0  # global node counter updated while converting trees


class Cassim:
    """CASSIM main class: syntactic similarity via tree edit distance."""

    def __init__(self):
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        self.parser = CoreNLPParser(url='http://localhost:9000')

    def convert_mytree(self, nltktree, pnode):
        """Recursively copy an NLTK ParentedTree into a zss Node tree,
        counting visited nodes in the global `numnodes`."""
        global numnodes
        for node in nltktree:
            numnodes += 1
            if type(node) is nltk.ParentedTree:
                tempnode = Node(node.label())
                pnode.addkid(tempnode)
                self.convert_mytree(node, tempnode)
        return pnode

    def syntax_similarity_conversation(self, documents1):
        """Syntactic similarity of each document with the one that follows it."""
        global numnodes
        documents1parsed = []
        # Sentence-split and parse each document; a document containing a
        # sentence longer than 70 tokens is marked "NA" and skipped later.
        for d1 in tqdm(range(len(documents1))):
            tempsents = self.sent_detector.tokenize(documents1[d1].strip())
            for s in tempsents:
                if len(s.split()) > 70:
                    documents1parsed.append("NA")
                    break
            else:  # no overlong sentence found: parse the whole document
                temp = list(self.parser.raw_parse_sents(tempsents))
                for i in range(len(temp)):
                    temp[i] = ParentedTree.convert(list(temp[i])[0])
                documents1parsed.append(list(temp))
        results = []
        for d1 in range(len(documents1parsed) - 1):
            d2 = d1 + 1
            if documents1parsed[d1] == "NA" or documents1parsed[d2] == "NA":
                results.append(float('NaN'))
                continue
            # Pairwise normalized tree edit distance between the sentences
            # of the two adjacent documents.
            costMatrix = []
            for i in range(len(documents1parsed[d1])):
                numnodes = 0
                tempnode = Node(documents1parsed[d1][i].root().label())
                sentencedoc1 = self.convert_mytree(documents1parsed[d1][i], tempnode)
                temp_costMatrix = []
                sen1nodes = numnodes
                for j in range(len(documents1parsed[d2])):
                    numnodes = 0
                    tempnode = Node(documents1parsed[d2][j].root().label())
                    sentencedoc2 = self.convert_mytree(documents1parsed[d2][j], tempnode)
                    ED = simple_distance(sentencedoc1, sentencedoc2)
                    ED /= (numnodes + sen1nodes)  # normalize by total node count
                    temp_costMatrix.append(ED)
                costMatrix.append(temp_costMatrix)
            costMatrix = np.array(costMatrix)
            results.append(1 - np.mean(costMatrix))
        return np.array(results)
```
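Putting the pieces together, a minimal driver for `syntax_similarity_conversation` might look like the sketch below. It assumes a CoreNLP server on localhost:9000 and a downloaded Punkt model (`nltk.download('punkt')`); the conversation turns are invented.

```python
# Usage sketch; server and Punkt-model setup assumed as described above.
cassim = Cassim()
turns = [
    'I went to the store. It was closed.',
    'The store was closed when I got there.',
    'Let us talk about something else entirely.',
]
scores = cassim.syntax_similarity_conversation(turns)
print(scores)  # one similarity score per adjacent pair of turns
```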