def lev_sim(s1, s2):
    """
    This function computes the Levenshtein similarity between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Levenshtein similarity if both the strings are not missing
        (i.e NaN), else returns NaN.
    """
    # Missing values propagate as NaN. pd.isnull(None) is True, but the
    # explicit None check is kept for parity with the sibling functions.
    # float('nan') replaces the deprecated pd.np.NaN alias (the pd.np
    # shim was removed in pandas 2.0).
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object.
    measure = sm.Levenshtein()

    # Coerce non-string scalars (e.g. numbers) to strings so the measure
    # can operate on them. isinstance accepts the nested tuple.
    if not isinstance(s1, (six.string_types, bytes)):
        s1 = str(s1)
    if not isinstance(s2, (six.string_types, bytes)):
        s2 = str(s2)

    # Call the function to compute the similarity measure.
    return measure.get_sim_score(s1, s2)
def matchHeaders(headers):
    """
    Compare the column headers of consecutive tables and print similar pairs.

    Args:
        headers: A sequence of header lists; headers[i] holds the column
            names of table i+1. Each header of table i is compared against
            every header of table i+1 only (j is fixed to i + 1).

    A pair is reported as matched when the overlap coefficient equals 1,
    or the Levenshtein or Jaccard score is at least 0.5.
    """
    jac = sm.Jaccard()
    lev = sm.Levenshtein()
    oc = sm.OverlapCoefficient()
    # The tokenizer is loop-invariant: create it once instead of once per
    # header pair (the original rebuilt it in the innermost loop).
    delim_tok = sm.DelimiterTokenizer(delim_set=['_'])
    header_len = len(headers)
    for i in range(0, header_len - 1):
        # Compare table i against the next table only.
        j = i + 1
        for first in headers[i]:
            for second in headers[j]:
                first_tokens = delim_tok.tokenize(first)
                second_tokens = delim_tok.tokenize(second)
                jacScore = jac.get_sim_score(first_tokens, second_tokens)
                levScore = lev.get_sim_score(first, second)
                ocScore = oc.get_sim_score(first_tokens, second_tokens)
                if (ocScore == 1 or levScore >= 0.5 or jacScore >= 0.5):
                    print(first + ' of Table' + str(i + 1) + ' and ' +
                          second + ' of Table' + str(j + 1) + ' matched')
def lev_sim(s1, s2):
    """
    This function computes the Levenshtein similarity between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Levenshtein similarity if both the strings are not missing
        (i.e NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.lev_sim('alex', 'alxe')
        0.5
        >>> em.lev_sim(None, 'alex')
        nan
    """
    # Missing values propagate as NaN; float('nan') replaces the
    # deprecated pd.np.NaN alias (pd.np was removed in pandas 2.0).
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object.
    measure = sm.Levenshtein()

    # Normalize both inputs to a unicode string representation before
    # scoring.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure.
    return measure.get_sim_score(s1, s2)
def lev_score(self, str_pair, sim_score=True):
    """
    Compute the Levenshtein score for a pair of strings.

    :param str_pair: pair of strings, validated via self._check_input
    :param sim_score: when True return the normalized similarity
        (0 to 1), otherwise return the raw score
    :return: similarity score or raw score
    """
    first, second = self._check_input(str_pair)
    measure = sm.Levenshtein()
    if sim_score:
        return measure.get_sim_score(first, second)
    return measure.get_raw_score(first, second)
def lev_sim(s1, s2):
    """Return the Levenshtein similarity of s1 and s2, or NaN if either is missing."""
    # float('nan') replaces the deprecated pd.np.NaN alias (the pd.np shim
    # was removed in pandas 2.0). pd.isnull(None) is already True, but the
    # explicit None check is kept for parity with the original flow.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')
    # Normalize both inputs to unicode strings before scoring.
    s1 = helper.convert_to_str_unicode(s1)
    s2 = helper.convert_to_str_unicode(s2)
    measure = sm.Levenshtein()
    return measure.get_sim_score(s1, s2)
def __init__(self):
    """Set up the pool of string-similarity measures and the shared tokenizer."""
    # All py_stringmatching measures used for matching, instantiated in a
    # fixed order.
    measure_factories = (
        sm.BagDistance,
        sm.Cosine,
        sm.Dice,
        sm.Editex,
        sm.GeneralizedJaccard,
        sm.Jaccard,
        sm.Jaro,
        sm.JaroWinkler,
        sm.Levenshtein,
        sm.OverlapCoefficient,
        sm.TverskyIndex,
    )
    self.similarity_function = [factory() for factory in measure_factories]
    # Tokenizer shared by the set-based measures; returns a token set.
    self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
def lev_sim(s1, s2):
    """Return the Levenshtein similarity of s1 and s2, or NaN for missing input."""
    # Missing values propagate as NaN; float('nan') replaces the
    # deprecated pd.np.NaN alias (pd.np was removed in pandas 2.0).
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object.
    measure = sm.Levenshtein()

    # Coerce non-string scalars (e.g. numbers) to strings; isinstance
    # accepts the nested tuple of types.
    if not isinstance(s1, (six.string_types, bytes)):
        s1 = str(s1)
    if not isinstance(s2, (six.string_types, bytes)):
        s2 = str(s2)

    # Call the function to compute the similarity measure.
    return measure.get_sim_score(s1, s2)
def lev_dist(s1, s2):
    """
    This function computes the Levenshtein distance between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Levenshtein distance if both the strings are not missing
        (i.e NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.lev_dist('alex', 'alxe')
        2
        >>> em.lev_dist(None, 'alex')
        nan
    """
    # Missing values propagate as NaN; float('nan') replaces the
    # deprecated pd.np.NaN alias (pd.np was removed in pandas 2.0).
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object.
    measure = sm.Levenshtein()

    # Coerce non-string scalars (e.g. numbers) to strings; isinstance
    # accepts the nested tuple of types.
    if not isinstance(s1, (six.string_types, bytes)):
        s1 = str(s1)
    if not isinstance(s2, (six.string_types, bytes)):
        s2 = str(s2)

    # Call the function to compute the distance measure.
    return measure.get_raw_score(s1, s2)
nlmInsampleFile = 'NLMdata/dataCached/insample_abstracts_outfile' nlmOutsampleFile = 'NLMdata/dataCached/outSample_abstracts_outfile' nlmInsampleData = pickle.load(open(nlmInsampleFile, 'rb')) nlmOutsampleData = pickle.load(open(nlmOutsampleFile, 'rb')) # Instantiate FVComponent instances csAbstract = FVC.CosSim('CSAbs', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False) csSentence = FVC.CosSim('CSSent', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True) cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True)) LVDist = FVC.stringMatchTitles('LVDist', sm.Levenshtein()) FVCList = [csAbstract, csSentence, cosM, LVDist] def classifyAndPredict(insampleData, outsampleData, folderName, componentList): print len(insampleData[0]) print len(outsampleData[1]) # Declare instance of a join object with input arguments easyJoin = myJoin.join(insampleData, outsampleData, folderName) easyJoin.setComponentList(componentList) # Build feature vector easyJoin.buildInsampleFV() easyJoin.buildOutsampleFVReduced(0.01) # Classify and predict with logistic regression easyJoin.classify()
# Row-wise string-similarity feature columns computed over the Sequence1 /
# Sequence2 columns of df (df and sm are defined earlier in the file,
# outside this chunk).
ed = sm.Editex()
df['Editex'] = df.apply(
    lambda x: ed.get_sim_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()

# In[41]:

jaro = sm.Jaro()
df['Jaro'] = df.apply(
    lambda x: jaro.get_sim_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()

# In[42]:

lev = sm.Levenshtein()
df['Levenshtein'] = df.apply(
    lambda x: lev.get_sim_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()

# In[43]:

# Raw (unnormalized) score here, unlike the sim scores above — presumably
# because Needleman-Wunsch has no normalized score; verify against the
# py_stringmatching API.
nw = sm.NeedlemanWunsch()
df['NeedlemanWunsch'] = df.apply(
    lambda x: nw.get_raw_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()

# In[44]:

# NOTE(review): this statement continues past the end of this chunk.
sw = sm.SmithWaterman()
df['SmithWaterman'] = df.apply(
def __init__(self):
    """Initialize the matcher with a Levenshtein similarity measure."""
    # Created once here and reused by the instance.
    measure = py_stringmatching.Levenshtein()
    self.lev = measure
def markStudDFSBlockAnswer(processQuestionId, studentAnswerId):
    """
    Mark a student's flowchart answer against the teacher's reference graph.

    Both graphs are traversed in lockstep with a stack-based DFS starting
    from each graph's 'start' node. At every level the teacher's child
    blocks are matched against the student's child blocks; unmatched blocks
    are classified as additional / deleted / substituted (or ambiguous
    combinations), counters are accumulated, and a textual `feedback`
    string is built. Finally the counts and feedback are persisted via
    allocateMarksAndSaveToDatabase.

    Args:
        processQuestionId: identifier of the question being marked.
        studentAnswerId: identifier of the student's submitted answer.

    NOTE(review): this body was reconstructed from whitespace-flattened
    source; the statement nesting reflects the most plausible reading and
    should be verified against the original file.
    """
    # Connect to Graph
    graph = connectToGraph()
    # NOTE(review): the tokenizer and the jaccard/levenshtein measures are
    # created here but not referenced later in this function body.
    whiteSpaceTokenizer = py_stringmatching.WhitespaceTokenizer(
        return_set=True)
    jaccard = py_stringmatching.Jaccard()
    levenshtein = py_stringmatching.Levenshtein()
    # Locate the 'start' node of each graph; traversal begins there.
    teacherStartNodeKey = graph.data(
        "MATCH (node:Teacher) WHERE node.text='start' RETURN node.key")
    studentStartNodeKey = graph.data(
        "MATCH (node:Student) WHERE node.text='start' RETURN node.key")
    teachStack = [teacherStartNodeKey[0]['node.key']]
    studStack = [studentStartNodeKey[0]['node.key']]
    teachVisitedNodes = []
    studVisitedNodes = []
    # keeps track of the nodes matched in each level
    matchedTeacherNodes = []
    matchedStudentNodes = []
    notMatchedParentTeacherNodes = []
    # keeps track of all the nodes visited throughout graph traversal and a node is added to this each time it is visited
    allMatchedTeachNodes = []
    allMatchedStudNodes = []
    additionalNodes = []
    deletedNodes = []
    substitutedNodes = []
    addOrSubNodes = []
    delOrSubNodes = []
    totNoOfAdditionalNodes = 0
    totNoOfDeletedNodes = 0
    totNoOfSubstitutedNodes = 0
    totNoOfOtherIncorrectNodes = 0
    totNoOfOtherSubstitutedNodes = 0
    totNoOfMatchedNodes = 0
    feedback = ""
    while teachStack or studStack:
        if teachStack and studStack:
            teachCurrent = teachStack.pop()
            studCurrent = studStack.pop()
            teacherCurrentText = graph.data(
                "MATCH (node:Teacher) WHERE node.key= {key} RETURN node.text",
                parameters={"key": teachCurrent})
            studentCurrentText = graph.data(
                "MATCH (node:Student) WHERE node.key= {key} RETURN node.text",
                parameters={"key": studCurrent})
            teacherChildNodes = graph.data(
                "MATCH (parent:Teacher)-[:TO]->(child:Teacher) WHERE parent.key= {key} RETURN child",
                parameters={"key": teachCurrent})  #teacherStartNodeKey[0]['node.key']
            studentChildNodes = graph.data(
                "MATCH (parent:Student)-[:TO]->(child:Student) WHERE parent.key= {key} RETURN child",
                parameters={"key": studCurrent})  #studentStartNodeKey[0]['node.key']
            teachChildNodesList = list(teacherChildNodes)
            studChildNodesList = list(studentChildNodes)
            # For every teacher child, look for a matching student child by
            # case-insensitive text match or phrase-similarity threshold.
            for teacherChild in teachChildNodesList:
                teachText = teacherChild['child']['text']
                # teachTextTokens = whiteSpaceTokenizer.tokenize(teacherChild['child']['text'])
                print(teachText)
                matchFound = 'false'
                for studentChild in studChildNodesList:
                    if not studentChild['child']['key'] in matchedStudentNodes:
                        print('current stud child')
                        print(studentChild['child']['text'])
                        childText = studentChild['child']['text']
                        synsetSim_score = getPhraseSimilarity(
                            teachText, childText)
                        if re.match(teachText, childText, re.IGNORECASE) or synsetSim_score >= 0.55:
                            print(
                                'threshold similarity added to Student stack')
                            feedback = feedback + 'The block:' + studentChild['child']['text'] + \
                                ' connected to block:' + studentCurrentText[0]['node.text'] + ' is correct. '
                            matchFound = 'true'
                            if not teacherChild['child'][
                                    'key'] in teachVisitedNodes:
                                studStack.append(studentChild['child']['key'])
                                teachStack.append(teacherChild['child']['key'])
                            # 'end' blocks are matched but not counted.
                            if not studentChild['child'][
                                    'key'] in allMatchedStudNodes and not studentChild[
                                        'child']['text'] == 'end':
                                totNoOfMatchedNodes = totNoOfMatchedNodes + 1
                            allMatchedTeachNodes.append(
                                teacherChild['child']['key'])
                            allMatchedStudNodes.append(
                                studentChild['child']['key'])
                            if len(teachChildNodesList) > len(
                                    studChildNodesList):
                                matchedTeacherNodes.append(
                                    teacherChild['child']['key'])
                                # add to student matched node set too to check while looping through the current level children (above)
                                matchedStudentNodes.append(
                                    studentChild['child']['key'])
                            elif len(teachChildNodesList) < len(
                                    studChildNodesList):
                                matchedStudentNodes.append(
                                    studentChild['child']['key'])
                            else:
                                matchedStudentNodes.append(
                                    studentChild['child']['key'])
                            break
                if matchFound == 'false' and not teacherChild['child'][
                        'key'] in teachVisitedNodes:
                    # len(teachChildNodesList) == len(studChildNodesList) and
                    notMatchedParentTeacherNodes.append(
                        teacherChild['child']['key'])
                elif matchFound == 'false' and teacherChild['child'][
                        'key'] in teachVisitedNodes:
                    feedback = feedback + 'The block:' + teacherChild['child']['text'] + \
                        ' should be connected to block:' + teacherCurrentText[0]['node.text'] + '. '
                    totNoOfOtherIncorrectNodes = totNoOfOtherIncorrectNodes + 1
            # Same child count on both sides with exactly one unmatched
            # teacher child: treat it as a single substituted node.
            if len(teachChildNodesList) == len(studChildNodesList) and len(
                    notMatchedParentTeacherNodes) == 1:
                print('^^^ONE SUBSTITUTED NODE')
                totNoOfSubstitutedNodes, totNoOfOtherIncorrectNodes, feedback = \
                    addTheOnlyUnmatchedNode('NotMatchedNode', graph,
                                            notMatchedParentTeacherNodes,
                                            teachStack, studChildNodesList,
                                            matchedStudentNodes, studStack,
                                            totNoOfSubstitutedNodes, feedback,
                                            studVisitedNodes, teachCurrent,
                                            studentCurrentText[0]['node.text'],
                                            totNoOfOtherIncorrectNodes)
            # Same child count but several unmatched teacher children:
            # they are all substitutions; try to pair them up first.
            elif len(teachChildNodesList) == len(studChildNodesList) and len(
                    notMatchedParentTeacherNodes) > 1:
                totNoOfSubstitutedNodes = totNoOfSubstitutedNodes + len(
                    notMatchedParentTeacherNodes)
                againNotMatchedTeacherNodes, handledStudentNodeList, feedback = checkForCurrentNodeChildMatch(
                    'substitutedCaller', graph, matchedStudentNodes,
                    notMatchedParentTeacherNodes, studChildNodesList,
                    studVisitedNodes, studStack, teachStack, feedback,
                    studentCurrentText[0]['node.text'])
                if len(againNotMatchedTeacherNodes) == 1:
                    totNoOfOtherIncorrectNodes, feedback = addTheOnlyUnmatchedNode(
                        'NotMatchedChildrenNode', graph,
                        againNotMatchedTeacherNodes, teachStack,
                        studChildNodesList, handledStudentNodeList, studStack,
                        totNoOfSubstitutedNodes, feedback, studVisitedNodes,
                        teachCurrent, studentCurrentText[0]['node.text'],
                        totNoOfOtherIncorrectNodes)
                elif len(againNotMatchedTeacherNodes) > 1:
                    # Several still unmatched: report each leftover student
                    # child with the list of candidate teacher blocks.
                    for studentChild in studChildNodesList:
                        if not studentChild['child'][
                                'key'] in handledStudentNodeList and not studentChild[
                                    'child']['key'] in studVisitedNodes:
                            feedback = feedback + 'The block:' + studentChild['child']['text'] + \
                                ' connected to block:' + studentCurrentText[0]['node.text'] + ' is substituted, and it '
                            for againNotTeacherNode in againNotMatchedTeacherNodes:
                                teacherNodeText = graph.data(
                                    "MATCH (node:Teacher) WHERE node.key= {key} RETURN node.text",
                                    parameters={"key": againNotTeacherNode})
                                feedback = feedback + ' should be:' + teacherNodeText[
                                    0]['node.text'] + ' or'
                            feedback = feedback + ' one of the mentioned blocks. The immediate blocks that follow ' +\
                                'this block:' + studentChild['child']['text'] + ' are also wrong. Please check them. '
                            substitutedNodes.append(
                                studentChild['child']['key'])
            # handles scenario where student graph has deleted child nodes for the current node under consideration
            if len(teachChildNodesList) > len(studChildNodesList):
                totNoOfDeletedNodes = totNoOfDeletedNodes + (
                    len(teachChildNodesList) - len(studChildNodesList))
                if len(matchedStudentNodes) == len(studChildNodesList):
                    # All student children matched: leftover teacher
                    # children are purely missing blocks.
                    for child in teachChildNodesList:
                        if not child['child'][
                                'key'] in matchedTeacherNodes and not child[
                                    'child']['key'] in teachVisitedNodes:
                            feedback = feedback + 'Missing Block:' + child['child']['text'] + \
                                ' should be connected to block:' + studentCurrentText[0]['node.text'] + '. '
                            deletedNodes.append(child['child']['key'])
                elif len(matchedStudentNodes) < len(studChildNodesList):
                    # Mixture of deleted and substituted children.
                    feedback = feedback + 'There is/are ' + str(len(teachChildNodesList) - len(studChildNodesList)) + \
                        ' missing block(s) that should be connected to block:' + studentCurrentText[0]['node.text'] + \
                        ' and ' + str(len(studChildNodesList) - len(matchedStudentNodes)) + \
                        ' block(s) connected to block:' + studentCurrentText[0]['node.text'] + \
                        ' is/are substituted - The incorrect blocks are '
                    againNotMatchedTeacherNodes, handledStudentNodeList, feedback = checkForCurrentNodeChildMatch(
                        'deletedSubstitutedCaller', graph, matchedStudentNodes,
                        notMatchedParentTeacherNodes, studChildNodesList,
                        studVisitedNodes, studStack, teachStack, feedback,
                        studentCurrentText[0]['node.text'])
                    if len(handledStudentNodeList) == len(studChildNodesList):
                        for child in teachChildNodesList:
                            if child['child'][
                                    'key'] in againNotMatchedTeacherNodes and not child[
                                        'child']['key'] in teachVisitedNodes:
                                feedback = feedback + 'block:' + child['child']['text'] + \
                                    ' that should be connected to block:' + studentCurrentText[0]['node.text'] +\
                                    ' is missing and '
                                deletedNodes.append(child['child']['key'])
                    elif len(handledStudentNodeList) < len(studChildNodesList):
                        for child in teachChildNodesList:
                            if child['child'][
                                    'key'] in againNotMatchedTeacherNodes and not child[
                                        'child']['key'] in teachVisitedNodes:
                                feedback = feedback + ' block:' + child['child']['text'] + \
                                    ' that should be/is connected to block:' + studentCurrentText[0]['node.text'] + \
                                    ' is deleted/substituted and the immediate child blocks of this block are also wrong, please check them, and '
                                delOrSubNodes.append(child['child']['key'])
                    feedback = feedback + 'please check all these incorrect blocks. '
            # handles scenario where student graph has additional child nodes for the current node under consideration
            elif len(teachChildNodesList) < len(studChildNodesList):
                totNoOfAdditionalNodes = totNoOfAdditionalNodes + (
                    len(studChildNodesList) - len(teachChildNodesList))
                # handles scenario where all teacher nodes are matched and there are additional nodes
                if len(matchedStudentNodes) == len(teachChildNodesList):
                    for child in studChildNodesList:
                        if not child['child'][
                                'key'] in matchedStudentNodes and not child[
                                    'child']['key'] in studVisitedNodes:
                            feedback = feedback + 'Additional Block:' + child['child']['text'] +\
                                ' is connected to block:' + studentCurrentText[0]['node.text'] + '. '
                            additionalNodes.append(child['child']['key'])
                        elif not child['child'][
                                'key'] in matchedStudentNodes and child[
                                    'child']['key'] in studVisitedNodes:
                            # Extra edge to an already-visited block.
                            feedback = feedback + 'Additional connection from block:' + studentCurrentText[0]['node.text'] +\
                                ' to block:' + child['child']['text'] + '. '
                elif len(matchedStudentNodes) < len(teachChildNodesList):
                    # Mixture of additional and substituted children.
                    feedback = feedback + 'There is/are ' + str(len(studChildNodesList) - len(teachChildNodesList)) + \
                        ' additional block(s) connected to block:' + studentCurrentText[0]['node.text'] + ' and ' +\
                        str(len(teachChildNodesList) - len(matchedStudentNodes)) +\
                        ' block(s) connected to block:' + studentCurrentText[0]['node.text'] + ' is/are substituted - The incorrect blocks are '
                    againNotMatchedTeacherNodes, handledStudentNodeList, feedback = checkForCurrentNodeChildMatch(
                        'additionalSubstitutedCaller', graph,
                        matchedStudentNodes, notMatchedParentTeacherNodes,
                        studChildNodesList, studVisitedNodes, studStack,
                        teachStack, feedback,
                        studentCurrentText[0]['node.text'])
                    if len(handledStudentNodeList) == len(
                            teachChildNodesList
                    ):  # len(againNotMatchedTeacherNodes) == (len(studChildNodesList)-len(teachChildNodesList))
                        for child in studChildNodesList:
                            if not child['child'][
                                    'key'] in handledStudentNodeList and not child[
                                        'child']['key'] in studVisitedNodes:
                                feedback = feedback + 'block:' + child['child']['text'] + ' connected to block:' +\
                                    studentCurrentText[0]['node.text'] + ' is additional and '
                                additionalNodes.append(child['child']['key'])
                    elif len(handledStudentNodeList) < len(
                            teachChildNodesList
                    ):  # len(againNotMatchedTeacherNodes) > (len(studChildNodesList)-len(teachChildNodesList))
                        for child in studChildNodesList:
                            if not child['child'][
                                    'key'] in handledStudentNodeList and not child[
                                        'child']['key'] in studVisitedNodes:
                                feedback = feedback + ' block: ' + child['child']['text'] + ' connected to block:' +\
                                    studentCurrentText[0]['node.text'] +\
                                    ' is additional/substituted and the immediate child blocks of this block are also wrong, please check them, and '
                                addOrSubNodes.append(child['child']['key'])
                    feedback = feedback + 'please check all these incorrect blocks. '
            # Reset the per-level bookkeeping and mark both current nodes
            # as visited before processing the next stack pair.
            matchedTeacherNodes = []
            matchedStudentNodes = []
            notMatchedParentTeacherNodes = []
            teachVisitedNodes.append(teachCurrent)
            studVisitedNodes.append(studCurrent)
        elif studStack and not teachStack:
            # NOTE(review): there is no branch for a non-empty teachStack
            # with an empty studStack; confirm that case cannot occur.
            print('^^^^^^^^^^^^^^^STUDENT stack has moreeee.....')
            break
    # handles additional nodes down an additional node starting path
    if additionalNodes:
        feedback, totNoOfAdditionalNodes = detectUndetectedBlocks(
            "additionalNodes", graph, additionalNodes, studVisitedNodes,
            feedback, totNoOfAdditionalNodes)
    # handles deleted nodes down a deleted node starting path
    if deletedNodes:
        feedback, totNoOfDeletedNodes = detectUndetectedBlocks(
            "deletedNodes", graph, deletedNodes, teachVisitedNodes, feedback,
            totNoOfDeletedNodes)
    # handles substituted nodes down a substituted node starting path
    if substitutedNodes:
        feedback, totNoOfOtherSubstitutedNodes = detectUndetectedBlocks(
            "substitutedNodes", graph, substitutedNodes, studVisitedNodes,
            feedback, totNoOfOtherSubstitutedNodes)
    # handles additional/substituted nodes down a additional/substituted node starting path
    if addOrSubNodes:
        feedback, totNoOfOtherIncorrectNodes = detectUndetectedBlocks(
            "addOrSubNodes", graph, addOrSubNodes, studVisitedNodes, feedback,
            totNoOfOtherIncorrectNodes)
    # handles deleted/substituted nodes down a deleted/substituted node starting path
    if delOrSubNodes:
        feedback, totNoOfOtherIncorrectNodes = detectUndetectedBlocks(
            "delOrSubNodes", graph, delOrSubNodes, teachVisitedNodes,
            feedback, totNoOfOtherIncorrectNodes)
    # Final verdict: perfect answer vs. summary of correct block count.
    if totNoOfAdditionalNodes == 0 and totNoOfDeletedNodes == 0 and totNoOfSubstitutedNodes == 0 and \
            totNoOfOtherSubstitutedNodes == 0 and totNoOfOtherIncorrectNodes == 0:
        print(totNoOfMatchedNodes)
        feedback = feedback + "Excellent Job! All the blocks and the flow are correct!"  # Number of correct blocks: " + ". "
        print(feedback)
    else:
        feedback = feedback + "Number of correct blocks except start and end blocks: " + str(
            totNoOfMatchedNodes) + ". "
        print(feedback)
    # Persist the computed counts and feedback for this answer.
    allocateMarksAndSaveToDatabase(totNoOfMatchedNodes,
                                   totNoOfAdditionalNodes, totNoOfDeletedNodes,
                                   totNoOfSubstitutedNodes,
                                   totNoOfOtherSubstitutedNodes,
                                   totNoOfOtherIncorrectNodes, feedback,
                                   processQuestionId, studentAnswerId)