예제 #1
0
def venn_intersect_text_tag(df_a, column_name, set_b, column_name_b, list_no):
    """Keep only the tokens that set A and set B have in common.

    Both columns must already be tokenized: every cell is iterated as a
    sequence of tokens, and every token is a mutable list that is indexed
    with ``list_no``. Tokens without a counterpart in the other set are
    emptied in place (``del token[:]``), so token lists shared with the
    caller's dataframes are modified as well.

    NOTE(review): a token of A is kept when its value is ``in`` the value
    of some token of B (substring containment if the values are strings),
    whereas a token of B is kept only when its value equals one of the
    kept A values. Confirm this asymmetry is intended.

    Args:
        df_a: Dataframe holding set A.
        column_name: Column of ``df_a`` containing the token lists.
        set_b: Dataframe holding set B.
        column_name_b: Column of ``set_b`` containing the token lists.
        list_no: Index, inside each token, of the value to compare
            (presumably 0 for text and 1 for tag; verify against callers).

    Returns:
        The tuple ``(df_a, set_b)`` as returned by ``sort.reindex``, with
        every non-matching token emptied.
    """
    df_a = sort.reindex(df_a)
    set_b = sort.reindex(set_b)

    # Flatten the comparable values of set B once instead of walking the
    # whole dataframe again for every token of set A. Emptied tokens carry
    # no value and are skipped.
    values_b = [
        word_b[list_no]
        for row_b in set_b[column_name_b]
        for word_b in row_b
        if word_b
    ]

    match = []
    for row in df_a[column_name]:
        for word in row:
            if not word:
                # Token was already emptied by an earlier filtering pass;
                # indexing it would raise IndexError.
                continue
            value = word[list_no]
            if value in match:
                continue
            if any(value in value_b for value_b in values_b):
                match.append(value)
            else:
                # No counterpart in set B: blank the token in place.
                del word[:]

    # Blank every token of set B whose value is not one of the kept values.
    for row_b in set_b[column_name_b]:
        for word_b in row_b:
            if word_b and word_b[list_no] not in match:
                del word_b[:]
    return df_a, set_b
예제 #2
0
def _contains_term_run(words, terms):
    """Return True when ``terms`` match a run of consecutive tokens in ``words``.

    ``terms`` is a list of ``(value, list_no)`` pairs; a token matches a
    pair when ``value in token[list_no]``. Tokens too short to be indexed
    (for example emptied tokens) never match.
    """
    span = len(terms)
    # Test every window start so that a failed partial match can never hide
    # a later, overlapping match (e.g. terms A,B against tokens A,A,B).
    for start in range(len(words) - span + 1):
        window = words[start:start + span]
        if all(
            len(word) > list_no and value in word[list_no]
            for word, (value, list_no) in zip(window, terms)
        ):
            return True
    return False


def assoc_term_attached(df, column_name, term_struct):
    """Blank the cells whose tokens do not contain the terms back to back.

    A cell of ``column_name`` is kept when some run of consecutive tokens
    matches ``term_struct`` element by element, in order. Every other cell
    is replaced by an empty list; no rows are removed here.

    Args:
        df: Dataframe to filter.
        column_name: Column whose cells are sequences of tokens, each
            token indexable as ``token[0]`` / ``token[1]``.
        term_struct: User-supplied list of ``[value, kind]`` pairs, e.g.
            ``[[first_value, 'text'], [second_value, 'tag']]``. The kind
            ``'text'`` compares against ``token[0]``; any other kind
            compares against ``token[1]``. An empty list keeps every cell.

    Returns:
        The dataframe returned by ``sort.reindex(df)`` with non-matching
        cells set to ``[]``.
    """
    df = sort.reindex(df)

    # Resolve each term to (value, position inside a token) once.
    terms = [(term[0], 0 if term[1] == 'text' else 1) for term in term_struct]

    rejected = [
        row_pos
        for row_pos, words in enumerate(df[column_name])
        if not _contains_term_run(words, terms)
    ]

    # Write through a single positional indexer: chained assignment such as
    # ``df.iloc[i][column_name] = []`` may act on a temporary copy and leave
    # the dataframe unchanged.
    col_pos = df.columns.get_loc(column_name)
    for row_pos in rejected:
        df.iat[row_pos, col_pos] = []
    return df
예제 #3
0
def venn_union(df_a, df_b):
    """Return the union of two dataframes as a single dataframe.

    Precondition: ``df_a`` and ``df_b`` must have the same columns.
    The rows of ``df_b`` are appended after the rows of ``df_a``.
    """
    combined = pd.concat([df_a, df_b])
    # The concatenation keeps the original index of each input, so the
    # result is passed through sort.reindex before being returned.
    return sort.reindex(combined)
예제 #4
0
def spacy_clean_cell(df, column_name):
    """Strip empty tokens from each cell and drop rows left with none.

    Other filtering steps can leave empty entries inside the cells of
    ``column_name``. Every falsy entry is removed from each cell, and rows
    whose cell ends up empty are dropped from the dataframe.

    Args:
        df: Dataframe to clean.
        column_name: Column whose cells are lists of tokens.

    Returns:
        The dataframe returned by ``sort.reindex(df)`` after cleaning. The
        rows are dropped in place and the index is not renumbered after
        the drop.
    """
    df = sort.reindex(df)
    col_pos = df.columns.get_loc(column_name)

    empty_rows = []
    for row_pos in range(len(df)):
        cell = df.iat[row_pos, col_pos]
        kept = [word for word in cell if word]
        if len(kept) != len(cell):
            # Write through a single indexer: chained assignment such as
            # ``df[column_name].loc[i] = ...`` may update a temporary copy
            # and leave the dataframe unchanged.
            df.iat[row_pos, col_pos] = kept
        if not kept:
            empty_rows.append(row_pos)

    # Drop all empty rows in one call, after the scan is finished.
    if empty_rows:
        df.drop(df.index[empty_rows], inplace=True)
    return df