Example #1
def jaccard(arr1, arr2):
    """
    This function computes the Jaccard measure between the two input
    lists/sets.

    Args:
        arr1, arr2 (list or set): The input lists or sets for which the Jaccard
            measure should be computed.

    Returns:
        The Jaccard measure if both lists/sets are not None and do not have
        any missing tokens (i.e., NaN); otherwise returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaccard(['data', 'science'], ['data'])
        0.5
        >>> em.jaccard(['data', 'science'], None)
        nan
    """

    if arr1 is None or arr2 is None:
        return pd.np.NaN
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return pd.np.NaN
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return pd.np.NaN
    # Create jaccard measure object
    measure = sm.Jaccard()
    # Call a function to compute a similarity score
    return measure.get_raw_score(arr1, arr2)
Example #2
def matchHeaders(headers):
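    """
    Compare the header names of each table with those of the next table and
    print the pairs whose overlap coefficient, Levenshtein or Jaccard
    similarity clears the thresholds below.
    """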
    jac = sm.Jaccard()
    lev = sm.Levenshtein()
    oc = sm.OverlapCoefficient()
    delim_tok = sm.DelimiterTokenizer(delim_set=['_'])

    header_len = len(headers)

    for i in range(0, header_len - 1):
        j = i + 1
        for first in headers[i]:
            for second in headers[j]:
                jacScore = jac.get_sim_score(delim_tok.tokenize(first),
                                             delim_tok.tokenize(second))
                levScore = lev.get_sim_score(first, second)
                ocScore = oc.get_sim_score(delim_tok.tokenize(first),
                                           delim_tok.tokenize(second))

                if ocScore == 1 or levScore >= 0.5 or jacScore >= 0.5:
                    print(first + ' of Table' + str(i + 1) + ' and ' + second +
                          ' of Table' + str(j + 1) + ' matched')
Example #3
 def jac_score(self, str_pair, sim_score=True) -> float:
     """
     Calculate the Jaccard similarity between two sets of tokens.
     :return: similarity score (0 to 1)
     """
     e1, e2 = self._check_input(str_pair, type_=list)
     jac = sm.Jaccard()
     return jac.get_sim_score(e1, e2) if sim_score else jac.get_raw_score(
         e1, e2)
Example #4
 def setUp(self):
     self.df = read_data(path_big_ten)
     self.trigramtok = sm.QgramTokenizer(qval=3)
     self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                           'name', 'name', self.trigramtok,
                                           0.3)
     self.jaccsim = sm.Jaccard()
     self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                      self.trigramtok, self.jaccsim)
Example #5
 def setUp(self):
     self.df = read_data(path_big_ten)
     self.trigramtok = sm.QgramTokenizer(qval=3)
     self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                           'name', 'name', self.trigramtok,
                                           0.3)
     self.jaccsim = sm.Jaccard()
     self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                      self.trigramtok, self.jaccsim)
     self.sim_matrix = get_sim_matrix(self.df, self.sim_scores)
     self.aggcl = AgglomerativeClustering(n_clusters=5,
                                          affinity='precomputed',
                                          linkage='complete')
     self.labels = self.aggcl.fit_predict(self.sim_matrix)
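A note on this setup (a sketch under assumptions, not from the source): with affinity='precomputed', scikit-learn's AgglomerativeClustering expects a distance matrix rather than a similarity matrix, so if get_sim_scores/get_sim_matrix produce Jaccard similarities they would typically be converted first, along these lines:

import numpy as np
from sklearn.cluster import AgglomerativeClustering

# toy similarity matrix for three records (symmetric, ones on the diagonal)
sim = np.array([[1.0, 0.9, 0.1],
                [0.9, 1.0, 0.2],
                [0.1, 0.2, 1.0]])
dist = 1.0 - sim  # 'precomputed' mode expects distances, not similarities
# note: newer scikit-learn releases rename affinity= to metric=
labels = AgglomerativeClustering(n_clusters=2, affinity='precomputed',
                                 linkage='complete').fit_predict(dist)
print(labels)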
Example #6
def jaccard(arr1, arr2):
    if arr1 is None or arr2 is None:
        return pd.np.NaN
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return pd.np.NaN
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return pd.np.NaN
    # Create jaccard measure object
    measure = sm.Jaccard()
    # Call a function to compute a similarity score
    return measure.get_raw_score(arr1, arr2)
Example #7
    def __init__(self):
        self.similarity_function = [
            sm.BagDistance(),
            sm.Cosine(),
            sm.Dice(),
            sm.Editex(),
            sm.GeneralizedJaccard(),
            sm.Jaccard(),
            sm.Jaro(),
            sm.JaroWinkler(),
            sm.Levenshtein(),
            sm.OverlapCoefficient(),
            sm.TverskyIndex()
        ]

        self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
Example #8
def jac_q3_sim(str1, str2):
    try:
        # not needed, as all values were already cast to string,
        # lower-cased and stripped before being handed over
        #str1 = str(str1).lower().strip()
        #str2 = str(str2).lower().strip()
        # assign a sim score of -1 when one of them is null
        if (str1 == 'nan' or str2 == 'nan' or str1 == '' or str2 == ''):
            return -1
        else:
            q3_tok = sm.QgramTokenizer(qval=3, return_set=True)
            jac = sm.Jaccard()
            return jac.get_raw_score(q3_tok.tokenize(str1),
                                     q3_tok.tokenize(str2))
    except Exception:
        logger.warning('Issue with Jaccard_q3_Sim, hence -1 assigned')
        return -1
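For reference, the same 3-gram + Jaccard combination can be exercised on its own; the strings below are made up, and logger plus the surrounding pipeline are assumed to exist elsewhere:

import py_stringmatching as sm

q3_tok = sm.QgramTokenizer(qval=3, return_set=True)
jac = sm.Jaccard()
# QgramTokenizer pads with '#'/'$' by default, so short strings still yield 3-grams
s1, s2 = 'johnson & johnson', 'johnson and johnson'
print(jac.get_raw_score(q3_tok.tokenize(s1), q3_tok.tokenize(s2)))  # score in [0, 1]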
Example #9
    def get_oov_jaccard_sim(self, s1, s2):
        en_tokens_f = word_tokenize(s1.lower())
        de_tokens_f = word_tokenize(s2.lower())

        # Replace OOV tokens with their matches, if found
        en_tokens = []
        for token in en_tokens_f:
            if token in self.en_oov:
                for el in self.en_oov[token]:
                    en_tokens.append(el)
            else:
                en_tokens.append(token)

        de_tokens = []
        for token in de_tokens_f:
            if token in self.de_oov:
                for el in self.de_oov[token]:
                    de_tokens.append(el)
            else:
                de_tokens.append(token)

        new_en_tokens = [
            token for token in en_tokens
            if token not in self.en_dictionary and token not in self.en_oov
        ]
        new_de_tokens = [
            token for token in de_tokens
            if token not in self.de_dictionary and token not in self.de_oov
        ]

        new_en_str = " ".join(new_en_tokens)
        new_de_str = " ".join(new_de_tokens)

        if new_en_str == "" or new_de_str == "":
            return 0

        ## Get 3-grams

        measure = sm.QgramTokenizer(qval=3)
        en_grams = measure.tokenize(new_en_str)
        de_grams = measure.tokenize(new_de_str)

        ## Get the Jaccard similarity

        measure = sm.Jaccard()
        return measure.get_raw_score(en_grams, de_grams)
Example #10
def main():
	import pickle
	import py_stringmatching as sm
	from sklearn.feature_extraction.text import TfidfVectorizer 
	INSAMPLE_ABS_OUTFILE = '../dataCached/insample_abstracts_outfile'
	OUTSAMPLE_ABS_OUTFILE = '../dataCached/outSample_abstracts_outfile'
	OUTSAMPLE_ABS_REDUCED_OUTFILE = '../dataCached/outSample_abstracts_reduced_outfile'
	a1 = pickle.load(open(INSAMPLE_ABS_OUTFILE,'rb'))
	a2 = pickle.load(open(OUTSAMPLE_ABS_OUTFILE,'rb'))
	a3 = pickle.load(open(OUTSAMPLE_ABS_REDUCED_OUTFILE,'rb'))
	csAbstract = CosSim('Cos Sim Abstract',TfidfVectorizer( ngram_range = ( 1, 3 ), sublinear_tf = True ),False)
	csSentence = CosSim('Cos Sim Sentence',TfidfVectorizer( ngram_range = ( 1, 3 ), sublinear_tf = True ),True)
	jacq3 = stringMatchExcerpts('Fuzzy Jaccard',sm.Jaccard(),sm.QgramTokenizer(qval=3))
	
	components = [csAbstract,csSentence,jacq3]
	a1Features = [c.generateFeatures(a1) for c in components]
	print(len(a1Features))
Example #11
import numpy as np
import os

def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)


INSAMPLE_FV_OUTFILE = 'dataCached/insampleFV_outfile'
OUTSAMPLE_FV_OUTFILE = 'dataCached/outsampleFV_outfile'
OUTSAMPLE_FV_REDUCED_OUTFILE = 'dataCached/outsampleFVreduced_outfile'

csAbstract = FVC.CosSim('CSAbs',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
                        False)
csSentence = FVC.CosSim('CSSent',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
                        True)
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(),
                                sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(),
                               sm.WhitespaceTokenizer(return_set=True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(),
                                 sm.QgramTokenizer(return_set=True))
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())

DEFAULTFV = [jacq3,cosM,cosMq3,LVdist]
DEFAULTMODEL = LR()
DEFAULTMODELNAME = 'LogisiticRegression'
DEFAULTITERATIONS = 25


class join:
    def __init__(self,insampleData,outsampleData,dataFolder):
        self.insampleData = insampleData #pairs,labels,pairedAbstracts,pairedTitles
        self.outsampleData = outsampleData #pairs,labels,pairedAbstracts,pairedTitles
        self.dataFolder = dataFolder
Example #12

df['Sequence1'] = df['aTokens'].apply(sentence)
df['Sequence2'] = df['bTokens'].apply(sentence)
df.head()

# In[26]:

get_ipython().system('pip install py_stringmatching')
import py_stringmatching as sm

# # Token Based Similarities

# In[27]:

jac = sm.Jaccard()
df['Jaccard'] = df.apply(
    lambda x: jac.get_sim_score(x['aTokens'], x['bTokens']), axis=1)
df.head()

# In[28]:

jaro = sm.Jaro()

# !pip install pyjarowinkler
# from pyjarowinkler import distance
# def jaro_similarity(word1, word2):
#   return distance.get_jaro_distance(word1, word2, winkler=False, scaling=0.1)


def jaccard_similarity_general(tokens1, tokens2):
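    # Body not shown in the source excerpt. A minimal sketch, assuming plain
    # set-based Jaccard over the two token collections:
    set1, set2 = set(tokens1), set(tokens2)
    if not set1 and not set2:
        return 1.0
    return len(set1 & set2) / len(set1 | set2)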
Example #13
def blocking_rules(x, y):
    # return True if x and y survive the blocking rules
    # x and y are pandas series
    x_directors = str(x['directors']).split(';')
    y_directors = str(y['directors']).split(';')
    
    x_writers = str(x['writers']).split(';')
    y_writers = str(y['writers']).split(';')
    x_actors = str(x['cast']).split(';')
    y_actors = str(y['cast']).split(';')
    director_match = False
    writer_match = False
    actor_match = False
    overlap_size = 0
    # create a tokenizer
    ws_tok = sm.WhitespaceTokenizer()
    # create a Jaccard similarity measure object
    jac = sm.Jaccard()
    for x_director in x_directors:
        if director_match == True:
            break
        else:
            # tokenize x_director using whitespace
            if x_director == 'nan':
                continue
            else:
                x_director = ws_tok.tokenize(x_director)
                for y_director in y_directors:
                    if y_director == 'nan':
                        continue
                    else:
                        # tokenize y_director using whitespace
                        y_director = ws_tok.tokenize(y_director)
                        if jac.get_sim_score(x_director, y_director) >= 0.8:
                            director_match = True
                            break
    for x_writer in x_writers:
        if writer_match == True:
            break
        else:
            if x_writer == 'nan':
                continue    
            else:
                x_writer = ws_tok.tokenize(x_writer)
                for y_writer in y_writers:
                    if y_writer == 'nan':
                        continue
                    else:
                        y_writer = ws_tok.tokenize(y_writer)
                        if jac.get_sim_score(x_writer, y_writer) >= 0.8:
                            writer_match = True
                            break
    for x_actor in x_actors:
        if actor_match == True:
            break
        else:
            if x_actor == 'nan':
                continue
            else:
                x_actor = ws_tok.tokenize(x_actor)
                for y_actor in y_actors:
                    if y_actor == 'nan':
                        continue
                    else:
                        y_actor = ws_tok.tokenize(y_actor)
                        if jac.get_sim_score(x_actor, y_actor) >= 0.8:
                            actor_match = True
                            break
    return not (actor_match or director_match or writer_match)
Example #14
import pandas as pd
from .util import suffix
import py_stringmatching as sm
from remp import string_matching

tokenizer = sm.QgramTokenizer(qval=2, return_set=True)
jaccard = sm.Jaccard()


def similarity_func_default(string1, string2):
    return jaccard.get_sim_score(tokenizer.tokenize(string1),
                                 tokenizer.tokenize(string2))
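# Quick illustration (not from the original module): the default similarity
# function compares character 2-gram token sets, e.g.
#   similarity_func_default('Stanford University', 'Stanford Univ')
# returns a Jaccard score in [0, 1].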


def construct_similarity_list(left_triples,
                              right_triples,
                              entity_candidates,
                              aligned_attributes=None,
                              similarity_func=None):
    if aligned_attributes is None:
        shared_attributes = set(left_triples['a'].unique())
        shared_attributes &= set(right_triples['a'].unique())
        shared_attributes = list(shared_attributes)
        aligned_attributes = pd.DataFrame({
            'a1': shared_attributes,
            'a2': shared_attributes
        })
    if 'attr_id' not in aligned_attributes:
        aligned_attributes['attr_id'] = aligned_attributes.index
    paired = pd.merge(entity_candidates, suffix(left_triples, '1'))
    paired = pd.merge(paired, aligned_attributes)
Example #15
def markStudDFSBlockAnswer(processQuestionId, studentAnswerId):
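    """
    Traverse the teacher and student answer graphs in Neo4j in parallel (DFS
    from each graph's 'start' node), match child blocks level by level via
    regex and phrase similarity, count additional, deleted and substituted
    blocks, build textual feedback, and persist the marks for the given
    question and answer.
    """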
    # Connect to Graph
    graph = connectToGraph()

    whiteSpaceTokenizer = py_stringmatching.WhitespaceTokenizer(
        return_set=True)
    jaccard = py_stringmatching.Jaccard()
    levenshtein = py_stringmatching.Levenshtein()

    teacherStartNodeKey = graph.data(
        "MATCH (node:Teacher) WHERE node.text='start' RETURN node.key")
    studentStartNodeKey = graph.data(
        "MATCH (node:Student) WHERE node.text='start' RETURN node.key")

    teachStack = [teacherStartNodeKey[0]['node.key']]
    studStack = [studentStartNodeKey[0]['node.key']]

    teachVisitedNodes = []
    studVisitedNodes = []

    # keeps track of the nodes matched in each level
    matchedTeacherNodes = []
    matchedStudentNodes = []

    notMatchedParentTeacherNodes = []

    # keeps track of all the nodes visited throughout graph traversal and a node is added to this each time it is visited
    allMatchedTeachNodes = []
    allMatchedStudNodes = []

    additionalNodes = []
    deletedNodes = []
    substitutedNodes = []
    addOrSubNodes = []
    delOrSubNodes = []

    totNoOfAdditionalNodes = 0
    totNoOfDeletedNodes = 0
    totNoOfSubstitutedNodes = 0
    totNoOfOtherIncorrectNodes = 0
    totNoOfOtherSubstitutedNodes = 0

    totNoOfMatchedNodes = 0

    feedback = ""

    while teachStack or studStack:

        if teachStack and studStack:

            teachCurrent = teachStack.pop()
            studCurrent = studStack.pop()

            teacherCurrentText = graph.data(
                "MATCH (node:Teacher) WHERE node.key= {key} RETURN node.text",
                parameters={"key": teachCurrent})

            studentCurrentText = graph.data(
                "MATCH (node:Student) WHERE node.key= {key} RETURN node.text",
                parameters={"key": studCurrent})

            teacherChildNodes = graph.data(
                "MATCH (parent:Teacher)-[:TO]->(child:Teacher) WHERE parent.key= {key} RETURN child",
                parameters={"key":
                            teachCurrent})  #teacherStartNodeKey[0]['node.key']

            studentChildNodes = graph.data(
                "MATCH (parent:Student)-[:TO]->(child:Student) WHERE parent.key= {key} RETURN child",
                parameters={"key":
                            studCurrent})  #studentStartNodeKey[0]['node.key']

            teachChildNodesList = list(teacherChildNodes)

            studChildNodesList = list(studentChildNodes)

            for teacherChild in teachChildNodesList:

                teachText = teacherChild['child']['text']
                # teachTextTokens = whiteSpaceTokenizer.tokenize(teacherChild['child']['text'])

                print(teachText)

                matchFound = 'false'

                for studentChild in studChildNodesList:
                    if not studentChild['child']['key'] in matchedStudentNodes:
                        print('current stud child')
                        print(studentChild['child']['text'])
                        childText = studentChild['child']['text']

                        synsetSim_score = getPhraseSimilarity(
                            teachText, childText)

                        if re.match(teachText, childText,
                                    re.IGNORECASE) or synsetSim_score >= 0.55:
                            print(
                                'threshold similarity added to Student stack')

                            feedback = feedback + 'The block:' + studentChild['child']['text'] + \
                                       ' connected to block:' + studentCurrentText[0]['node.text'] + ' is correct. '

                            matchFound = 'true'

                            if not teacherChild['child'][
                                    'key'] in teachVisitedNodes:
                                studStack.append(studentChild['child']['key'])

                                teachStack.append(teacherChild['child']['key'])

                                if not studentChild['child'][
                                        'key'] in allMatchedStudNodes and not studentChild[
                                            'child']['text'] == 'end':
                                    totNoOfMatchedNodes = totNoOfMatchedNodes + 1

                                allMatchedTeachNodes.append(
                                    teacherChild['child']['key'])
                                allMatchedStudNodes.append(
                                    studentChild['child']['key'])

                            if len(teachChildNodesList) > len(
                                    studChildNodesList):
                                matchedTeacherNodes.append(
                                    teacherChild['child']['key'])

                                # add to student matched node set too to check while looping through the current level children (above)
                                matchedStudentNodes.append(
                                    studentChild['child']['key'])
                            elif len(teachChildNodesList) < len(
                                    studChildNodesList):
                                matchedStudentNodes.append(
                                    studentChild['child']['key'])
                            else:
                                matchedStudentNodes.append(
                                    studentChild['child']['key'])

                            break

                if matchFound == 'false' and not teacherChild['child'][
                        'key'] in teachVisitedNodes:  # len(teachChildNodesList) == len(studChildNodesList) and
                    notMatchedParentTeacherNodes.append(
                        teacherChild['child']['key'])
                elif matchFound == 'false' and teacherChild['child'][
                        'key'] in teachVisitedNodes:
                    feedback = feedback + 'The block:' + teacherChild['child']['text'] + \
                               ' should be connected to block:' + teacherCurrentText[0]['node.text'] + '. '
                    totNoOfOtherIncorrectNodes = totNoOfOtherIncorrectNodes + 1

            if len(teachChildNodesList) == len(studChildNodesList) and len(
                    notMatchedParentTeacherNodes) == 1:

                print('^^^ONE SUBSTITUTED NODE')

                totNoOfSubstitutedNodes, totNoOfOtherIncorrectNodes, feedback = \
                    addTheOnlyUnmatchedNode('NotMatchedNode', graph, notMatchedParentTeacherNodes,
                                        teachStack, studChildNodesList, matchedStudentNodes,
                                        studStack, totNoOfSubstitutedNodes, feedback, studVisitedNodes,
                                        teachCurrent, studentCurrentText[0]['node.text'], totNoOfOtherIncorrectNodes)

            elif len(teachChildNodesList) == len(studChildNodesList) and len(
                    notMatchedParentTeacherNodes) > 1:

                totNoOfSubstitutedNodes = totNoOfSubstitutedNodes + len(
                    notMatchedParentTeacherNodes)

                againNotMatchedTeacherNodes, handledStudentNodeList, feedback = checkForCurrentNodeChildMatch(
                    'substitutedCaller', graph, matchedStudentNodes,
                    notMatchedParentTeacherNodes, studChildNodesList,
                    studVisitedNodes, studStack, teachStack, feedback,
                    studentCurrentText[0]['node.text'])

                if len(againNotMatchedTeacherNodes) == 1:
                    totNoOfOtherIncorrectNodes, feedback = addTheOnlyUnmatchedNode(
                        'NotMatchedChildrenNode', graph,
                        againNotMatchedTeacherNodes, teachStack,
                        studChildNodesList, handledStudentNodeList, studStack,
                        totNoOfSubstitutedNodes, feedback, studVisitedNodes,
                        teachCurrent, studentCurrentText[0]['node.text'],
                        totNoOfOtherIncorrectNodes)

                elif len(againNotMatchedTeacherNodes) > 1:
                    for studentChild in studChildNodesList:
                        if not studentChild['child'][
                                'key'] in handledStudentNodeList and not studentChild[
                                    'child']['key'] in studVisitedNodes:
                            feedback = feedback + 'The block:' + studentChild['child']['text'] + \
                                               ' connected to block:' + studentCurrentText[0]['node.text'] + ' is substituted, and it '

                            for againNotTeacherNode in againNotMatchedTeacherNodes:
                                teacherNodeText = graph.data(
                                    "MATCH (node:Teacher) WHERE node.key= {key} RETURN node.text",
                                    parameters={"key": againNotTeacherNode})

                                feedback = feedback + ' should be:' + teacherNodeText[
                                    0]['node.text'] + ' or'

                            feedback = feedback + ' one of the mentioned blocks. The immediate blocks that follow ' +\
                                       'this block:' + studentChild['child']['text'] + ' are also wrong. Please check them. '

                            substitutedNodes.append(
                                studentChild['child']['key'])

            # handles scenario where student graph has deleted child nodes for the current node under consideration
            if len(teachChildNodesList) > len(studChildNodesList):
                totNoOfDeletedNodes = totNoOfDeletedNodes + (
                    len(teachChildNodesList) - len(studChildNodesList))

                if len(matchedStudentNodes) == len(studChildNodesList):
                    for child in teachChildNodesList:
                        if not child['child'][
                                'key'] in matchedTeacherNodes and not child[
                                    'child']['key'] in teachVisitedNodes:
                            feedback = feedback + 'Missing Block:' + child['child']['text'] + \
                                               ' should be connected to block:' + studentCurrentText[0]['node.text'] + '. '
                            deletedNodes.append(child['child']['key'])
                elif len(matchedStudentNodes) < len(studChildNodesList):
                    feedback = feedback + 'There is/are ' + str(len(teachChildNodesList) - len(studChildNodesList)) + \
                               ' missing block(s) that should be connected to block:' + studentCurrentText[0]['node.text'] + \
                               ' and ' + str(len(studChildNodesList) - len(matchedStudentNodes)) + \
                               ' block(s) connected to block:' + studentCurrentText[0]['node.text'] + \
                               ' is/are substituted - The incorrect blocks are '

                    againNotMatchedTeacherNodes, handledStudentNodeList, feedback = checkForCurrentNodeChildMatch(
                        'deletedSubstitutedCaller', graph, matchedStudentNodes,
                        notMatchedParentTeacherNodes, studChildNodesList,
                        studVisitedNodes, studStack, teachStack, feedback,
                        studentCurrentText[0]['node.text'])

                    if len(handledStudentNodeList) == len(studChildNodesList):
                        for child in teachChildNodesList:
                            if child['child'][
                                    'key'] in againNotMatchedTeacherNodes and not child[
                                        'child']['key'] in teachVisitedNodes:
                                feedback = feedback + 'block:' + child['child']['text'] + \
                                           ' that should be connected to block:' + studentCurrentText[0]['node.text'] +\
                                           ' is missing and '
                                deletedNodes.append(child['child']['key'])

                    elif len(handledStudentNodeList) < len(studChildNodesList):
                        for child in teachChildNodesList:
                            if child['child'][
                                    'key'] in againNotMatchedTeacherNodes and not child[
                                        'child']['key'] in teachVisitedNodes:
                                feedback = feedback + ' block:' + child['child']['text'] + \
                                           ' that should be/is connected to block:' + studentCurrentText[0]['node.text'] + \
                                           ' is deleted/substituted and the immediate child blocks of this block are also wrong, please check them, and '

                                delOrSubNodes.append(child['child']['key'])

                    feedback = feedback + 'please check all these incorrect blocks. '

            # handles scenario where student graph has additional child nodes for the current node under consideration
            elif len(teachChildNodesList) < len(studChildNodesList):
                totNoOfAdditionalNodes = totNoOfAdditionalNodes + (
                    len(studChildNodesList) - len(teachChildNodesList))

                # handles scenario where all teacher nodes are matched and there are additional nodes
                if len(matchedStudentNodes) == len(teachChildNodesList):
                    for child in studChildNodesList:
                        if not child['child'][
                                'key'] in matchedStudentNodes and not child[
                                    'child']['key'] in studVisitedNodes:
                            feedback = feedback + 'Additional Block:' + child['child']['text'] +\
                                       ' is connected to block:' + studentCurrentText[0]['node.text'] + '. '
                            additionalNodes.append(child['child']['key'])
                        elif not child['child'][
                                'key'] in matchedStudentNodes and child[
                                    'child']['key'] in studVisitedNodes:
                            feedback = feedback + 'Additional connection from block:' + studentCurrentText[0]['node.text'] +\
                                       ' to block:' + child['child']['text'] + '. '
                elif len(matchedStudentNodes) < len(teachChildNodesList):
                    feedback = feedback + 'There is/are ' + str(len(studChildNodesList) - len(teachChildNodesList)) + \
                               ' additional block(s) connected to block:' + studentCurrentText[0]['node.text'] + ' and ' +\
                               str(len(teachChildNodesList) - len(matchedStudentNodes)) +\
                               ' block(s) connected to block:' + studentCurrentText[0]['node.text'] + ' is/are substituted - The incorrect blocks are '

                    againNotMatchedTeacherNodes, handledStudentNodeList, feedback = checkForCurrentNodeChildMatch(
                        'additionalSubstitutedCaller', graph,
                        matchedStudentNodes, notMatchedParentTeacherNodes,
                        studChildNodesList, studVisitedNodes, studStack,
                        teachStack, feedback,
                        studentCurrentText[0]['node.text'])

                    if len(handledStudentNodeList) == len(
                            teachChildNodesList
                    ):  # len(againNotMatchedTeacherNodes) == (len(studChildNodesList)-len(teachChildNodesList))
                        for child in studChildNodesList:
                            if not child['child'][
                                    'key'] in handledStudentNodeList and not child[
                                        'child']['key'] in studVisitedNodes:
                                feedback = feedback + 'block:' + child['child']['text'] + ' connected to block:' +\
                                           studentCurrentText[0]['node.text'] + ' is additional and '
                                additionalNodes.append(child['child']['key'])

                    elif len(handledStudentNodeList) < len(
                            teachChildNodesList
                    ):  # len(againNotMatchedTeacherNodes) > (len(studChildNodesList)-len(teachChildNodesList))
                        for child in studChildNodesList:
                            if not child['child'][
                                    'key'] in handledStudentNodeList and not child[
                                        'child']['key'] in studVisitedNodes:
                                feedback = feedback + ' block: ' + child['child']['text'] + ' connected to block:' +\
                                           studentCurrentText[0]['node.text'] +\
                                ' is additional/substituted and the immediate child blocks of this block are also wrong, please check them, and '

                                addOrSubNodes.append(child['child']['key'])

                    feedback = feedback + 'please check all these incorrect blocks. '

            matchedTeacherNodes = []
            matchedStudentNodes = []

            notMatchedParentTeacherNodes = []

            teachVisitedNodes.append(teachCurrent)
            studVisitedNodes.append(studCurrent)

        elif studStack and not teachStack:
            print('^^^^^^^^^^^^^^^STUDENT stack has moreeee.....')
            break

    # handles additional nodes down an additional node starting path
    if additionalNodes:
        feedback, totNoOfAdditionalNodes = detectUndetectedBlocks(
            "additionalNodes", graph, additionalNodes, studVisitedNodes,
            feedback, totNoOfAdditionalNodes)

    # handles deleted nodes down a deleted node starting path
    if deletedNodes:
        feedback, totNoOfDeletedNodes = detectUndetectedBlocks(
            "deletedNodes", graph, deletedNodes, teachVisitedNodes, feedback,
            totNoOfDeletedNodes)

    # handles substituted nodes down a substituted node starting path
    if substitutedNodes:
        feedback, totNoOfOtherSubstitutedNodes = detectUndetectedBlocks(
            "substitutedNodes", graph, substitutedNodes, studVisitedNodes,
            feedback, totNoOfOtherSubstitutedNodes)

    # handles additional/substituted nodes down a additional/substituted node starting path
    if addOrSubNodes:
        feedback, totNoOfOtherIncorrectNodes = detectUndetectedBlocks(
            "addOrSubNodes", graph, addOrSubNodes, studVisitedNodes, feedback,
            totNoOfOtherIncorrectNodes)

    # handles deleted/substituted nodes down a deleted/substituted node starting path
    if delOrSubNodes:
        feedback, totNoOfOtherIncorrectNodes = detectUndetectedBlocks(
            "delOrSubNodes", graph, delOrSubNodes, teachVisitedNodes, feedback,
            totNoOfOtherIncorrectNodes)

    if totNoOfAdditionalNodes == 0 and totNoOfDeletedNodes == 0 and totNoOfSubstitutedNodes == 0 and \
            totNoOfOtherSubstitutedNodes == 0 and totNoOfOtherIncorrectNodes == 0:
        print(totNoOfMatchedNodes)
        feedback = feedback + "Excellent Job! All the blocks and the flow are correct!"  # Number of correct blocks: " + ". "
        print(feedback)
    else:
        feedback = feedback + "Number of correct blocks except start and end blocks: " + str(
            totNoOfMatchedNodes) + ". "
        print(feedback)

    allocateMarksAndSaveToDatabase(totNoOfMatchedNodes, totNoOfAdditionalNodes,
                                   totNoOfDeletedNodes,
                                   totNoOfSubstitutedNodes,
                                   totNoOfOtherSubstitutedNodes,
                                   totNoOfOtherIncorrectNodes, feedback,
                                   processQuestionId, studentAnswerId)
Example #16
nlmTwoNamesInsample = pickle.load(open(nlmInsampleFile + 'secondName', 'rb'))
nlmTwoNamesOutsample = pickle.load(open(nlmOutsampleFile + 'secondName', 'rb'))

SOInsampleFile = 'stackoverflowdata/' + insample_data
SOOutsampleFile = 'stackoverflowdata/' + outsample_data
SOInsampleData = pickle.load(open(SOInsampleFile, 'rb'))
SOOutsampleData = pickle.load(open(SOOutsampleFile, 'rb'))

csAbstract = FVC.CosSim('CSAbs',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
                        False)
csSentence = FVC.CosSim('CSSent',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
                        True)
jac = FVC.stringMatchExcerpts('Jacc', sm.Jaccard(),
                              sm.WhitespaceTokenizer(return_set=True))
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(),
                                sm.QgramTokenizer(qval=3, return_set=True))
dice = FVC.stringMatchExcerpts('Dice', sm.Dice(),
                               sm.WhitespaceTokenizer(return_set=True))
diceq3 = FVC.stringMatchExcerpts('Dice', sm.Dice(),
                                 sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(),
                               sm.WhitespaceTokenizer(return_set=True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(),
                                 sm.QgramTokenizer(return_set=True))
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())
sw = FVC.stringMatchTitles('SW', sm.SmithWaterman())
nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch())
jw = FVC.stringMatchTitles('JW', sm.JaroWinkler())