Example #1
import numpy as np
import pandas as pd
import py_stringmatching as sm
import six


def tok_wspace(input_string):
    """
    This function splits the input string into a list of tokens
    (based on whitespace).

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens if the input string is not NaN,
        otherwise NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_wspace('data science')
        ['data', 'science']
        >>> em.tok_wspace('data         science')
        ['data', 'science']
        >>> em.tok_wspace(None)
        nan


    """
    if pd.isnull(input_string):
        return np.nan

    # input_string = remove_non_ascii(input_string)
    if not isinstance(input_string, (six.string_types, bytes)):
        input_string = str(input_string)
    elif isinstance(input_string, bytes):
        input_string = input_string.decode('utf-8')
    measure = sm.WhitespaceTokenizer()
    return measure.tokenize(input_string)
Example #2
import numpy as np
import pandas as pd
import py_stringmatching as sm


def tok_wspace(input_string):
    """
    This function splits the input string into a list of tokens
    (based on whitespace).

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens if the input string is not NaN,
        otherwise NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_wspace('data science')
        ['data', 'science']
        >>> em.tok_wspace('data         science')
        ['data', 'science']
        >>> em.tok_wspace(None)
        nan


    """
    if pd.isnull(input_string):
        return np.nan

    # input_string = remove_non_ascii(input_string)
    # gh is assumed to be py_entitymatching's generic-helper module
    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.WhitespaceTokenizer()
    return measure.tokenize(input_string)
Example #3
def candidate_matching(dataset):
    import tempfile
    cache_base_dir = tempfile.mkdtemp('remp')
    (l1, l2) = dataset.label
    # keep (subject 's', value 'v') pairs whose attribute 'a' is the label
    labels_1 = dataset.attributes_1[
        dataset.attributes_1['a'] == l1][['s', 'v']]
    labels_2 = dataset.attributes_2[
        dataset.attributes_2['a'] == l2][['s', 'v']]

    # normalize label values: stringify, strip accents, lowercase
    labels_1['v'] = labels_1['v'].apply(
        str).apply(unidecode.unidecode).str.lower()
    labels_2['v'] = labels_2['v'].apply(
        str).apply(unidecode.unidecode).str.lower()
    tokenizer = sm.WhitespaceTokenizer(return_set=True)
    num_pairs, pair_files = jaccard_join(
        labels_1['v'], labels_2['v'], labels_1['s'], labels_2['s'],
        tokenizer, 0.3, cache_base_dir + '/', n_jobs=-1)
    M_c = [pd.read_pickle(f) for f in pair_files]
    return pd.DataFrame(sum(M_c, []), columns=['s1', 's2']).drop_duplicates()
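
# Illustrative sketch (not from the source) of what the 0.3 Jaccard
# threshold above compares: label values are stripped of accents with
# unidecode, lowercased, and tokenized on whitespace into sets; a pair of
# subjects is kept when |A & B| / |A | B| >= 0.3.
import py_stringmatching as sm
import unidecode

tok = sm.WhitespaceTokenizer(return_set=True)
a = tok.tokenize(unidecode.unidecode('Café de la Paix').lower())
b = tok.tokenize(unidecode.unidecode('cafe de paix').lower())
print(sm.Jaccard().get_sim_score(a, b))  # 0.75 -> kept as a candidate pair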
Example #4
    def jaccard_similarity(self, dataset, threshold):

        df = add_key_reindex(dataset)
        # concatenate all columns and convert as one string
        # for each row with '*' as separator

        A = dataset.applymap(str)
        A = A.apply(lambda x: '*'.join(x.values.tolist()), axis=1)
        A = A.astype(str)
        A = A.str.replace(" ", "")
        df['row'] = A

        ssj.profile_table_for_join(df)

        ws = sm.WhitespaceTokenizer(return_set=True)

        # auto join
        output_pairs = ssj.jaccard_join(df, df, 'New_ID',
                                        'New_ID', 'row', 'row', ws,
                                        threshold, l_out_attrs=['row'],
                                        r_out_attrs=['row'], n_jobs=-1)

        dup = output_pairs[output_pairs['l_New_ID']
                           != output_pairs['r_New_ID']]
        dataset = df[~df['New_ID'].isin(dup['r_New_ID'])]
        # reassign instead of dropping inplace on a slice, which would raise
        # a SettingWithCopyWarning
        dataset = dataset.drop(["New_ID", "row"], axis=1)
        print("Number of duplicate rows removed:", len(set(dup['r_New_ID'])))

        return dataset
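
# A minimal standalone sketch (assumed, not from the source) of the row
# serialization trick used above: every column is stringified and joined
# with '*' so that a whole row can be compared as a single string.
import pandas as pd

toy = pd.DataFrame({'name': ['Ann Lee', 'Ann  Lee'], 'city': ['NY', 'NY']})
row = toy.applymap(str).apply(lambda x: '*'.join(x.values.tolist()), axis=1)
row = row.str.replace(' ', '', regex=False)
print(row.tolist())  # ['AnnLee*NY', 'AnnLee*NY'] -> near-duplicates collide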
Example #5
SOOutsampleData = pickle.load(open(SOOutsampleFile, 'rb'))

nlmInsampleFile = 'NLMdata/dataCached/insample_abstracts_outfile'
nlmOutsampleFile = 'NLMdata/dataCached/outSample_abstracts_outfile'
nlmInsampleData = pickle.load(open(nlmInsampleFile, 'rb'))
nlmOutsampleData = pickle.load(open(nlmOutsampleFile, 'rb'))

# Instantiate FVComponent instances
csAbstract = FVC.CosSim('CSAbs',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
                        False)
csSentence = FVC.CosSim('CSSent',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
                        True)
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(),
                               sm.WhitespaceTokenizer(return_set=True))
LVDist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())

FVCList = [csAbstract, csSentence, cosM, LVDist]


def classifyAndPredict(insampleData, outsampleData, folderName, componentList):
    print(len(insampleData[0]))
    print(len(outsampleData[1]))
    # Declare instance of a join object with input arguments
    easyJoin = myJoin.join(insampleData, outsampleData, folderName)
    easyJoin.setComponentList(componentList)
    # Build feature vector
    easyJoin.buildInsampleFV()
    easyJoin.buildOutsampleFVReduced(0.01)
    # Classify and predict with logistic regression
Example #6
import os

def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)


INSAMPLE_FV_OUTFILE = 'dataCached/insampleFV_outfile'
OUTSAMPLE_FV_OUTFILE = 'dataCached/outsampleFV_outfile'
OUTSAMPLE_FV_REDUCED_OUTFILE = 'dataCached/outsampleFVreduced_outfile'

csAbstract = FVC.CosSim('CSAbs', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
csSentence = FVC.CosSim('CSSent', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(), sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(), sm.QgramTokenizer(return_set=True))
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())

DEFAULTFV = [jacq3, cosM, cosMq3, LVdist]
DEFAULTMODEL = LR()
DEFAULTMODELNAME = 'LogisticRegression'
DEFAULTITERATIONS = 25


class join:
    def __init__(self, insampleData, outsampleData, dataFolder):
        self.insampleData = insampleData    # pairs, labels, pairedAbstracts, pairedTitles
        self.outsampleData = outsampleData  # pairs, labels, pairedAbstracts, pairedTitles
        self.dataFolder = dataFolder
        self.labels = insampleData[1]
Example #7
def blocking_rules(x, y):
    # return True if x and y survive the blocking rules
    # x and y are pandas series
    x_directors = str(x['directors']).split(';')
    y_directors = str(y['directors']).split(';')
    
    x_writers = str(x['writers']).split(';')
    y_writers = str(y['writers']).split(';')
    x_actors = str(x['cast']).split(';')
    y_actors = str(y['cast']).split(';')
    director_match = False
    writer_match = False
    actor_match = False
    overlap_size = 0
    # create a tokenizer
    ws_tok = sm.WhitespaceTokenizer()
    # create a Jaccard similarity measure object
    jac = sm.Jaccard()
    for x_director in x_directors:
        if director_match:
            break
        if x_director == 'nan':
            continue
        # tokenize x_director using whitespace
        x_director = ws_tok.tokenize(x_director)
        for y_director in y_directors:
            if y_director == 'nan':
                continue
            # tokenize y_director using whitespace
            y_director = ws_tok.tokenize(y_director)
            if jac.get_sim_score(x_director, y_director) >= 0.8:
                director_match = True  # bug fix: was `==`, a no-op comparison
                break
    for x_writer in x_writers:
        if writer_match:
            break
        if x_writer == 'nan':
            continue
        x_writer = ws_tok.tokenize(x_writer)
        for y_writer in y_writers:
            if y_writer == 'nan':
                continue
            y_writer = ws_tok.tokenize(y_writer)
            if jac.get_sim_score(x_writer, y_writer) >= 0.8:
                writer_match = True
                break
    for x_actor in x_actors:
        if actor_match:
            break
        if x_actor == 'nan':
            continue
        x_actor = ws_tok.tokenize(x_actor)
        for y_actor in y_actors:
            if y_actor == 'nan':
                continue
            y_actor = ws_tok.tokenize(y_actor)
            if jac.get_sim_score(x_actor, y_actor) >= 0.8:
                actor_match = True
                break
    # the pair "survives" the blocking rules only when no field matched
    return not (actor_match or director_match or writer_match)
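
# A minimal sketch of the core check inside blocking_rules: Jaccard
# similarity over whitespace tokens. The names below are illustrative,
# not taken from the movie data.
import py_stringmatching as sm

ws_tok = sm.WhitespaceTokenizer()
jac = sm.Jaccard()
a = ws_tok.tokenize('christopher nolan')
b = ws_tok.tokenize('christopher  nolan')  # extra whitespace collapses
print(jac.get_sim_score(a, b))  # 1.0 >= 0.8 -> director_match would be set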
Example #8
    timeObj1=datetime.strptime(row[6].strip(), '%M:%S').time()
    row[6] = timeObj1
    timeObj1=datetime.strptime(row[10].strip(), '%M:%S').time()
    row[10] = timeObj1
    # iteration #2: trim whitespaces from artist and track labels
    row[3] = row[3].strip()
    row[4] = row[4].strip()
    row[7] = row[7].strip()
    row[8] = row[8].strip()
    sampledList.append(row)
f.close()

# Converting every row in to a feature vector
featList = []
label = []
ws = ps.WhitespaceTokenizer()
for item in sampledList:
    fi = []

    jaro1 = ps.Jaro()

    # iteration #3:
    # leave the feature value at zero unless some token pair from the two
    # artist strings has a high enough similarity score
    f1 = 0
    for t1 in ws.tokenize(item[3]):
        if max([jaro1.get_raw_score(t1, t2) for t2 in ws.tokenize(item[7])]) > .75:
            f1 = jaro1.get_raw_score(item[3], item[7])
            break

    # iteration #3:
Example #9
def markStudDFSBlockAnswer(processQuestionId, studentAnswerId):
    # Connect to Graph
    graph = connectToGraph()

    whiteSpaceTokenizer = py_stringmatching.WhitespaceTokenizer(
        return_set=True)
    jaccard = py_stringmatching.Jaccard()
    levenshtein = py_stringmatching.Levenshtein()

    teacherStartNodeKey = graph.data(
        "MATCH (node:Teacher) WHERE node.text='start' RETURN node.key")
    studentStartNodeKey = graph.data(
        "MATCH (node:Student) WHERE node.text='start' RETURN node.key")

    teachStack = [teacherStartNodeKey[0]['node.key']]
    studStack = [studentStartNodeKey[0]['node.key']]

    teachVisitedNodes = []
    studVisitedNodes = []

    # keeps track of the nodes matched in each level
    matchedTeacherNodes = []
    matchedStudentNodes = []

    notMatchedParentTeacherNodes = []

    # keeps track of all the nodes visited throughout graph traversal and a node is added to this each time it is visited
    allMatchedTeachNodes = []
    allMatchedStudNodes = []

    additionalNodes = []
    deletedNodes = []
    substitutedNodes = []
    addOrSubNodes = []
    delOrSubNodes = []

    totNoOfAdditionalNodes = 0
    totNoOfDeletedNodes = 0
    totNoOfSubstitutedNodes = 0
    totNoOfOtherIncorrectNodes = 0
    totNoOfOtherSubstitutedNodes = 0

    totNoOfMatchedNodes = 0

    feedback = ""

    while teachStack or studStack:

        if teachStack and studStack:

            teachCurrent = teachStack.pop()
            studCurrent = studStack.pop()

            teacherCurrentText = graph.data(
                "MATCH (node:Teacher) WHERE node.key= {key} RETURN node.text",
                parameters={"key": teachCurrent})

            studentCurrentText = graph.data(
                "MATCH (node:Student) WHERE node.key= {key} RETURN node.text",
                parameters={"key": studCurrent})

            teacherChildNodes = graph.data(
                "MATCH (parent:Teacher)-[:TO]->(child:Teacher) WHERE parent.key= {key} RETURN child",
                parameters={"key":
                            teachCurrent})  #teacherStartNodeKey[0]['node.key']

            studentChildNodes = graph.data(
                "MATCH (parent:Student)-[:TO]->(child:Student) WHERE parent.key= {key} RETURN child",
                parameters={"key":
                            studCurrent})  #studentStartNodeKey[0]['node.key']

            teachChildNodesList = list(teacherChildNodes)

            studChildNodesList = list(studentChildNodes)

            for teacherChild in teachChildNodesList:

                teachText = teacherChild['child']['text']
                # teachTextTokens = whiteSpaceTokenizer.tokenize(teacherChild['child']['text'])

                print(teachText)

                matchFound = False

                for studentChild in studChildNodesList:
                    if not studentChild['child']['key'] in matchedStudentNodes:
                        print('current stud child')
                        print(studentChild['child']['text'])
                        childText = studentChild['child']['text']

                        synsetSim_score = getPhraseSimilarity(
                            teachText, childText)

                        # NB: re.match treats teachText as a regex pattern;
                        # special characters in block text may misbehave
                        if re.match(teachText, childText,
                                    re.IGNORECASE) or synsetSim_score >= 0.55:
                            print(
                                'threshold similarity added to Student stack')

                            feedback = feedback + 'The block:' + studentChild['child']['text'] + \
                                       ' connected to block:' + studentCurrentText[0]['node.text'] + ' is correct. '

                            matchFound = True

                            if not teacherChild['child'][
                                    'key'] in teachVisitedNodes:
                                studStack.append(studentChild['child']['key'])

                                teachStack.append(teacherChild['child']['key'])

                                if not studentChild['child'][
                                        'key'] in allMatchedStudNodes and not studentChild[
                                            'child']['text'] == 'end':
                                    totNoOfMatchedNodes = totNoOfMatchedNodes + 1

                                allMatchedTeachNodes.append(
                                    teacherChild['child']['key'])
                                allMatchedStudNodes.append(
                                    studentChild['child']['key'])

                            # always record the matched student node so it is
                            # skipped when looping over the remaining children
                            if len(teachChildNodesList) > len(
                                    studChildNodesList):
                                matchedTeacherNodes.append(
                                    teacherChild['child']['key'])
                            matchedStudentNodes.append(
                                studentChild['child']['key'])

                            break

                if not matchFound and teacherChild['child'][
                        'key'] not in teachVisitedNodes:
                    notMatchedParentTeacherNodes.append(
                        teacherChild['child']['key'])
                elif not matchFound and teacherChild['child'][
                        'key'] in teachVisitedNodes:
                    feedback = feedback + 'The block:' + teacherChild['child']['text'] + \
                               ' should be connected to block:' + teacherCurrentText[0]['node.text'] + '. '
                    totNoOfOtherIncorrectNodes = totNoOfOtherIncorrectNodes + 1

            if len(teachChildNodesList) == len(studChildNodesList) and len(
                    notMatchedParentTeacherNodes) == 1:

                print('^^^ONE SUBSTITUTED NODE')

                totNoOfSubstitutedNodes, totNoOfOtherIncorrectNodes, feedback = \
                    addTheOnlyUnmatchedNode('NotMatchedNode', graph, notMatchedParentTeacherNodes,
                                        teachStack, studChildNodesList, matchedStudentNodes,
                                        studStack, totNoOfSubstitutedNodes, feedback, studVisitedNodes,
                                        teachCurrent, studentCurrentText[0]['node.text'], totNoOfOtherIncorrectNodes)

            elif len(teachChildNodesList) == len(studChildNodesList) and len(
                    notMatchedParentTeacherNodes) > 1:

                totNoOfSubstitutedNodes = totNoOfSubstitutedNodes + len(
                    notMatchedParentTeacherNodes)

                againNotMatchedTeacherNodes, handledStudentNodeList, feedback = checkForCurrentNodeChildMatch(
                    'substitutedCaller', graph, matchedStudentNodes,
                    notMatchedParentTeacherNodes, studChildNodesList,
                    studVisitedNodes, studStack, teachStack, feedback,
                    studentCurrentText[0]['node.text'])

                if len(againNotMatchedTeacherNodes) == 1:
                    totNoOfOtherIncorrectNodes, feedback = addTheOnlyUnmatchedNode(
                        'NotMatchedChildrenNode', graph,
                        againNotMatchedTeacherNodes, teachStack,
                        studChildNodesList, handledStudentNodeList, studStack,
                        totNoOfSubstitutedNodes, feedback, studVisitedNodes,
                        teachCurrent, studentCurrentText[0]['node.text'],
                        totNoOfOtherIncorrectNodes)

                elif len(againNotMatchedTeacherNodes) > 1:
                    for studentChild in studChildNodesList:
                        if not studentChild['child'][
                                'key'] in handledStudentNodeList and not studentChild[
                                    'child']['key'] in studVisitedNodes:
                            feedback = feedback + 'The block:' + studentChild['child']['text'] + \
                                               ' connected to block:' + studentCurrentText[0]['node.text'] + ' is substituted, and it '

                            for againNotTeacherNode in againNotMatchedTeacherNodes:
                                teacherNodeText = graph.data(
                                    "MATCH (node:Teacher) WHERE node.key= {key} RETURN node.text",
                                    parameters={"key": againNotTeacherNode})

                                feedback = feedback + ' should be:' + teacherNodeText[
                                    0]['node.text'] + ' or'

                            feedback = feedback + ' one of the mentioned blocks. The immediate blocks that follow ' +\
                                       'this block:' + studentChild['child']['text'] + ' are also wrong. Please check them. '

                            substitutedNodes.append(
                                studentChild['child']['key'])

            # handles scenario where student graph has deleted child nodes for the current node under consideration
            if len(teachChildNodesList) > len(studChildNodesList):
                totNoOfDeletedNodes = totNoOfDeletedNodes + (
                    len(teachChildNodesList) - len(studChildNodesList))

                if len(matchedStudentNodes) == len(studChildNodesList):
                    for child in teachChildNodesList:
                        if not child['child'][
                                'key'] in matchedTeacherNodes and not child[
                                    'child']['key'] in teachVisitedNodes:
                            feedback = feedback + 'Missing Block:' + child['child']['text'] + \
                                               ' should be connected to block:' + studentCurrentText[0]['node.text'] + '. '
                            deletedNodes.append(child['child']['key'])
                elif len(matchedStudentNodes) < len(studChildNodesList):
                    feedback = feedback + 'There is/are ' + str(len(teachChildNodesList) - len(studChildNodesList)) + \
                               ' missing block(s) that should be connected to block:' + studentCurrentText[0]['node.text'] + \
                               ' and ' + str(len(studChildNodesList) - len(matchedStudentNodes)) + \
                               ' block(s) connected to block:' + studentCurrentText[0]['node.text'] + \
                               ' is/are substituted - The incorrect blocks are '

                    againNotMatchedTeacherNodes, handledStudentNodeList, feedback = checkForCurrentNodeChildMatch(
                        'deletedSubstitutedCaller', graph, matchedStudentNodes,
                        notMatchedParentTeacherNodes, studChildNodesList,
                        studVisitedNodes, studStack, teachStack, feedback,
                        studentCurrentText[0]['node.text'])

                    if len(handledStudentNodeList) == len(studChildNodesList):
                        for child in teachChildNodesList:
                            if child['child'][
                                    'key'] in againNotMatchedTeacherNodes and not child[
                                        'child']['key'] in teachVisitedNodes:
                                feedback = feedback + 'block:' + child['child']['text'] + \
                                           ' that should be connected to block:' + studentCurrentText[0]['node.text'] +\
                                           ' is missing and '
                                deletedNodes.append(child['child']['key'])

                    elif len(handledStudentNodeList) < len(studChildNodesList):
                        for child in teachChildNodesList:
                            if child['child'][
                                    'key'] in againNotMatchedTeacherNodes and not child[
                                        'child']['key'] in teachVisitedNodes:
                                feedback = feedback + ' block:' + child['child']['text'] + \
                                           ' that should be/is connected to block:' + studentCurrentText[0]['node.text'] + \
                                           ' is deleted/substituted and the immediate child blocks of this block are also wrong, please check them, and '

                                delOrSubNodes.append(child['child']['key'])

                    feedback = feedback + 'please check all these incorrect blocks. '

            # handles scenario where student graph has additional child nodes for the current node under consideration
            elif len(teachChildNodesList) < len(studChildNodesList):
                totNoOfAdditionalNodes = totNoOfAdditionalNodes + (
                    len(studChildNodesList) - len(teachChildNodesList))

                # handles scenario where all teacher nodes are matched and there are additional nodes
                if len(matchedStudentNodes) == len(teachChildNodesList):
                    for child in studChildNodesList:
                        if not child['child'][
                                'key'] in matchedStudentNodes and not child[
                                    'child']['key'] in studVisitedNodes:
                            feedback = feedback + 'Additional Block:' + child['child']['text'] +\
                                       ' is connected to block:' + studentCurrentText[0]['node.text'] + '. '
                            additionalNodes.append(child['child']['key'])
                        elif not child['child'][
                                'key'] in matchedStudentNodes and child[
                                    'child']['key'] in studVisitedNodes:
                            feedback = feedback + 'Additional connection from block:' + studentCurrentText[0]['node.text'] +\
                                       ' to block:' + child['child']['text'] + '. '
                elif len(matchedStudentNodes) < len(teachChildNodesList):
                    feedback = feedback + 'There is/are ' + str(len(studChildNodesList) - len(teachChildNodesList)) + \
                               ' additional block(s) connected to block:' + studentCurrentText[0]['node.text'] + ' and ' +\
                               str(len(teachChildNodesList) - len(matchedStudentNodes)) +\
                               ' block(s) connected to block:' + studentCurrentText[0]['node.text'] + ' is/are substituted - The incorrect blocks are '

                    againNotMatchedTeacherNodes, handledStudentNodeList, feedback = checkForCurrentNodeChildMatch(
                        'additionalSubstitutedCaller', graph,
                        matchedStudentNodes, notMatchedParentTeacherNodes,
                        studChildNodesList, studVisitedNodes, studStack,
                        teachStack, feedback,
                        studentCurrentText[0]['node.text'])

                    if len(handledStudentNodeList) == len(
                            teachChildNodesList
                    ):  # len(againNotMatchedTeacherNodes) == (len(studChildNodesList)-len(teachChildNodesList))
                        for child in studChildNodesList:
                            if not child['child'][
                                    'key'] in handledStudentNodeList and not child[
                                        'child']['key'] in studVisitedNodes:
                                feedback = feedback + 'block:' + child['child']['text'] + ' connected to block:' +\
                                           studentCurrentText[0]['node.text'] + ' is additional and '
                                additionalNodes.append(child['child']['key'])

                    elif len(handledStudentNodeList) < len(
                            teachChildNodesList
                    ):  # len(againNotMatchedTeacherNodes) > (len(studChildNodesList)-len(teachChildNodesList))
                        for child in studChildNodesList:
                            if not child['child'][
                                    'key'] in handledStudentNodeList and not child[
                                        'child']['key'] in studVisitedNodes:
                                feedback = feedback + ' block: ' + child['child']['text'] + ' connected to block:' +\
                                           studentCurrentText[0]['node.text'] +\
                                ' is additional/substituted and the immediate child blocks of this block are also wrong, please check them, and '

                                addOrSubNodes.append(child['child']['key'])

                    feedback = feedback + 'please check all these incorrect blocks. '

            matchedTeacherNodes = []
            matchedStudentNodes = []

            notMatchedParentTeacherNodes = []

            teachVisitedNodes.append(teachCurrent)
            studVisitedNodes.append(studCurrent)

        elif studStack and not teachStack:
            print('^^^^^^^^^^^^^^^STUDENT stack has moreeee.....')
            break
        else:
            # teacher stack still has nodes but the student stack is empty;
            # stop here to avoid looping forever
            break

    # handles additional nodes down an additional node starting path
    if additionalNodes:
        feedback, totNoOfAdditionalNodes = detectUndetectedBlocks(
            "additionalNodes", graph, additionalNodes, studVisitedNodes,
            feedback, totNoOfAdditionalNodes)

    # handles deleted nodes down a deleted node starting path
    if deletedNodes:
        feedback, totNoOfDeletedNodes = detectUndetectedBlocks(
            "deletedNodes", graph, deletedNodes, teachVisitedNodes, feedback,
            totNoOfDeletedNodes)

    # handles substituted nodes down a substituted node starting path
    if substitutedNodes:
        feedback, totNoOfOtherSubstitutedNodes = detectUndetectedBlocks(
            "substitutedNodes", graph, substitutedNodes, studVisitedNodes,
            feedback, totNoOfOtherSubstitutedNodes)

    # handles additional/substituted nodes down a additional/substituted node starting path
    if addOrSubNodes:
        feedback, totNoOfOtherIncorrectNodes = detectUndetectedBlocks(
            "addOrSubNodes", graph, addOrSubNodes, studVisitedNodes, feedback,
            totNoOfOtherIncorrectNodes)

    # handles deleted/substituted nodes down a deleted/substituted node starting path
    if delOrSubNodes:
        feedback, totNoOfOtherIncorrectNodes = detectUndetectedBlocks(
            "delOrSubNodes", graph, delOrSubNodes, teachVisitedNodes, feedback,
            totNoOfOtherIncorrectNodes)

    if totNoOfAdditionalNodes == 0 and totNoOfDeletedNodes == 0 and totNoOfSubstitutedNodes == 0 and \
            totNoOfOtherSubstitutedNodes == 0 and totNoOfOtherIncorrectNodes == 0:
        print(totNoOfMatchedNodes)
        feedback = feedback + "Excellent Job! All the blocks and the flow are correct!"
        print(feedback)
    else:
        feedback = feedback + "Number of correct blocks except start and end blocks: " + str(
            totNoOfMatchedNodes) + ". "
        print(feedback)

    allocateMarksAndSaveToDatabase(totNoOfMatchedNodes, totNoOfAdditionalNodes,
                                   totNoOfDeletedNodes,
                                   totNoOfSubstitutedNodes,
                                   totNoOfOtherSubstitutedNodes,
                                   totNoOfOtherIncorrectNodes, feedback,
                                   processQuestionId, studentAnswerId)
Example #10
# In[20]:

# transforming the "budget" column into string and creating a new **mixture** column
ssj.utils.converter.dataframe_column_to_str(imdb_data, 'budget', inplace=True)
imdb_data['mixture'] = imdb_data['norm_title'] + ' ' + imdb_data['norm_year'] + ' ' + imdb_data['budget']

# repeating the same thing for the Kaggle dataset
ssj.utils.converter.dataframe_column_to_str(kaggle_data, 'budget', inplace=True)
kaggle_data['mixture'] = kaggle_data['norm_movie_title'] + ' ' + \
    kaggle_data['norm_title_year'] + ' ' + kaggle_data['budget']


# Now, we can use the **mixture** columns to create a desired candidate set which we call **C**.

# In[21]:

C = ssj.overlap_coefficient_join(kaggle_data, imdb_data, 'id', 'id', 'mixture', 'mixture', sm.WhitespaceTokenizer(), 
                                 l_out_attrs=['norm_movie_title', 'norm_title_year', 'duration',
                                              'budget', 'content_rating'],
                                 r_out_attrs=['norm_title', 'norm_year', 'length', 'budget', 'mpaa'],
                                 threshold=0.65)
C.shape


# We can see that by doing a similarity join, we already reduced the candidate set to 18,317 pairs.
# 
# #### Substep B: Specifying the keys 
# The next step is to specify to the **py_entitymatching** package which columns correspond to the keys in each dataframe. Also, we need to specify which columns correspond to the foreign keys of the two dataframes in the candidate set.

# In[22]:

import py_entitymatching as em
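
# The notebook is truncated here. A sketch of the key-specification calls it
# leads into, mirroring Example #12 below; '_id', 'l_id' and 'r_id' are the
# columns py_stringsimjoin added to the candidate set C built above.
em.set_key(kaggle_data, 'id')
em.set_key(imdb_data, 'id')
em.set_key(C, '_id')
em.set_ltable(C, kaggle_data)
em.set_rtable(C, imdb_data)
em.set_fk_ltable(C, 'l_id')
em.set_fk_rtable(C, 'r_id')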
Example #11
import numpy as np
import pandas as pd
import py_stringmatching as sm


def tok_wspace(input_string):
    if pd.isnull(input_string):
        return np.nan

    measure = sm.WhitespaceTokenizer()
    return measure.tokenize(input_string)
Example #12
                                            inplace=True)
kaggle_data['mixture'] = kaggle_data['norm_movie_title'] + ' ' + kaggle_data[
    'norm_title_year'] + ' ' + kaggle_data['budget']

# Now, we can use the **mixture** columns to create a desired candidate set which we call **C**.

# In[21]:

C = ssj.overlap_coefficient_join(
    kaggle_data,
    imdb_data,
    'id',
    'id',
    'mixture',
    'mixture',
    sm.WhitespaceTokenizer(),
    l_out_attrs=[
        'norm_movie_title', 'norm_title_year', 'duration', 'budget',
        'content_rating'
    ],
    r_out_attrs=['norm_title', 'norm_year', 'length', 'budget', 'mpaa'],
    threshold=0.65)
C.shape

# We can see that by doing a similarity join, we already reduced the candidate set to 18,317 pairs.
#
# #### Substep B: Specifying the keys
# The next step is to specify to the **py_entitymatching** package which columns correspond to the keys in each dataframe. Also, we need to specify which columns correspond to the foreign keys of the two dataframes in the candidate set.

# In[22]:
ivivus['id'] = range(ivivus.shape[0])

# Edit distance
#similar_titles = ssj.edit_distance_join(hotels, mytour , 'id', 'id', 'norm_name','norm_name', l_out_attrs=['link','norm_name','norm_address','rooms', 'norm_star', 'benefits','norm_image'], r_out_attrs=['link','norm_name','norm_address','rooms', 'norm_star', 'benefits','norm_image'], threshold=2)


# A: Finding a candidate set using simple heuristic
# create mixture column
ssj.utils.converter.dataframe_column_to_str(booking, 'norm_star', inplace=True)
booking['mixture'] = booking['norm_name'] + ' ' + booking['norm_address']

# repeating the same thing for the hotels table
#ssj.utils.converter.dataframe_column_to_str(hotels, 'norm_star', inplace=True)
hotels['mixture'] = hotels['norm_name'] + ' ' + hotels['norm_address'] 

C = ssj.overlap_coefficient_join(hotels, booking, 'id','id', 'mixture', 'mixture', sm.WhitespaceTokenizer(), 
                                 l_out_attrs=['link','norm_name','name','norm_address','head_address','rooms', 'norm_star', 'benefits','norm_image','rating','destination'], 
                                 r_out_attrs=['link','norm_name','name','norm_address','head_address','rooms', 'norm_star', 'benefits','norm_image','rating','destination'], 
                                 threshold=0.7)
print(C.shape)


# Creating the Rule-Based Matcher
import py_entitymatching as em
em.set_key(hotels, 'id')
em.set_key(booking, 'id')
em.set_key(C, '_id')
em.set_ltable(C, hotels)
em.set_rtable(C, booking)
em.set_fk_rtable(C, 'r_id')
em.set_fk_ltable(C, 'l_id')
Example #14
    '1gram_set': sm.QgramTokenizer(qval=1, return_set=True),
    '1grams_set': sm.QgramTokenizer(qval=1, return_set=True),
    '2grams_set': sm.QgramTokenizer(qval=2, return_set=True),
    '3grams_set': sm.QgramTokenizer(qval=3, return_set=True),
    '4grams_set': sm.QgramTokenizer(qval=4, return_set=True),
    '5grams_set': sm.QgramTokenizer(qval=5, return_set=True),
    '6grams_set': sm.QgramTokenizer(qval=6, return_set=True),
    '7grams_set': sm.QgramTokenizer(qval=7, return_set=True),
    '8grams_set': sm.QgramTokenizer(qval=8, return_set=True),
    '9grams_set': sm.QgramTokenizer(qval=9, return_set=True),

    # Word tokenizers
    'alphanumeric': sm.AlphanumericTokenizer(),
    'alphanum': sm.AlphanumericTokenizer(),
    'alphabetic': sm.AlphabeticTokenizer(),
    'whitespace': sm.WhitespaceTokenizer(),
    'alphanumeric_set': sm.AlphanumericTokenizer(return_set=True),
    'alphanum_set': sm.AlphanumericTokenizer(return_set=True),
    'alphabetic_set': sm.AlphabeticTokenizer(return_set=True),
    'whitespace_set': sm.WhitespaceTokenizer(return_set=True),
}

cleaner_lookup = {
    'lower_and_strip': lower_and_strip,
    'alphanumeric': clean_to_alphanum,
    'alphanum': clean_to_alphanum,
}
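
# Hedged usage sketch of the registry pattern above; `tokenizer_lookup` is an
# assumed name, since the dict's opening line is cut off in this snippet.
import py_stringmatching as sm

tokenizer_lookup = {
    'whitespace': sm.WhitespaceTokenizer(),
    'whitespace_set': sm.WhitespaceTokenizer(return_set=True),
}
print(tokenizer_lookup['whitespace'].tokenize('data data science'))
# -> ['data', 'data', 'science']
print(tokenizer_lookup['whitespace_set'].tokenize('data data science'))
# -> ['data', 'science']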


def get_similarity_measure(measure, **kwargs):
    if isinstance(measure, str):