Example #1
def lines_in_files(path):
    # Count the total number of lines across all files found under path
    all_files = csv_access.list_files_in_directory(path)
    total = 0
    for f in all_files:
        with open(f, "r") as g:
            lines = g.readlines()
            total += len(lines)
    print("Total lines: " + str(total))
Example #2
def get_spo_from_uns(path="/data/smalldatasets/clean_triple_relations/",
                     loc_dic=None):
    all_files = csv_access.list_files_in_directory(path)

    if loc_dic is not None:
        # retrieve max index
        loc_idx = max(loc_dic.values(), default=-1) + 1  # the next free index (default handles an empty dict)

    spos = []
    for fname in all_files:
        if loc_dic is not None:
            loc_dic[fname] = loc_idx
            loc_idx += 1
        df = pd.read_csv(fname, encoding='latin1')
        for index, row in df.iterrows():
            s, p, o = row['s'], row['p'], row['o']
            s = nlp_utils.filter(s)
            p = nlp_utils.filter(p)
            o = nlp_utils.filter(o)
            if s == "" or p == "" or o == "":
                continue
            spo = (s, p, o)
            spos.append(spo)

    print("Original data: " + str(len(spos)))
    # Identify contained elements and remove those facts
    to_remove = set()

    for idx in range(len(spos)):
        if idx in to_remove:
            continue
        s, p, o = spos[idx]
        fact = s + " " + p + " " + o
        ftok = fact.split(" ")
        for jdx in range(len(spos)):
            if jdx in to_remove:
                continue
            if jdx == idx:
                continue
            sj, pj, oj = spos[jdx]
            alt_fact = sj + " " + pj + " " + oj
            aftok = alt_fact.split(" ")
            if len(set(ftok) - set(aftok)) == 0:
                to_remove.add(idx)

    # drop the facts whose indexes were marked for removal
    spos = [el for idx, el in enumerate(spos) if idx not in to_remove]
    # for el in data:
    #      print(el)

    print("Proc data: " + str(len(spos)))

    return spos, loc_dic
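
A note on the de-duplication pass above: a fact is dropped when every one of its tokens also appears in some other surviving fact. The standalone sketch below mirrors that subset check; the helper name remove_contained_facts and the sample triples are illustrative, not part of the original module.

def remove_contained_facts(spos):
    # Drop any (s, p, o) whose token set is contained in another surviving fact's tokens
    to_remove = set()
    token_sets = [set((s + " " + p + " " + o).split(" ")) for s, p, o in spos]
    for idx, ftok in enumerate(token_sets):
        for jdx, aftok in enumerate(token_sets):
            if jdx == idx or jdx in to_remove:
                continue
            if ftok <= aftok:  # every token of fact idx appears in fact jdx
                to_remove.add(idx)
                break
    return [el for idx, el in enumerate(spos) if idx not in to_remove]

facts = [("mike", "teaches", "databases"),
         ("mike", "teaches", "databases at mit"),
         ("ana", "studies", "networks")]
print(remove_contained_facts(facts))
# -> [('mike', 'teaches', 'databases at mit'), ('ana', 'studies', 'networks')]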
Example #3
def statistics_words(path):
    # Report token and line counts across all files found under path
    all_files = csv_access.list_files_in_directory(path)
    total_tokens = 0
    total_lines = 0
    for f in all_files:
        with open(f, "r") as g:
            lines = g.readlines()
            total_lines += len(lines)
            for l in lines:
                tokens = l.split(" ")
                total_tokens += len(tokens)
    print("Total tokens: " + str(total_tokens))
    print("Avg tokens: " + str(float(total_tokens/total_lines)))
Example #4
def extract_data_from_directory(path="/Users/ra-mit/data/mitdwhdata/",
                                rows_per_relation=5):
    all_files = csv_access.list_files_in_directory(path)
    pairs = []
    for filename in all_files:
        name = filename.split("/")[-1].split(".")[0]
        df = pd.read_csv(filename, encoding='latin1')
        columns = df.columns
        for c in columns:
            pair = (name, c, 0)
            pairs.append(pair)  # filename - cols
        current_rows = 0
        for idx, row in df.iterrows():
            current_rows += 1
            # # filter cols based on type
            # valid_type_columns = []
            # for c in columns:
            #     if df[c].dtype == string:
            #         valid_type_columns.append(c)
            # columns = valid_type_columns
            for c in columns:
                if re.search('[0-9]', str(row[c])) is not None:
                    continue
                if pd.isnull(row[c]):
                    continue
                if str(row[c]) == 'nan':
                    continue
                pair = (c, row[c], 0)
                pairs.append(pair)  # cols - colvalues
            colref = columns[0]
            for c1 in columns:
                if re.search('[0-9]', str(
                        row[colref])) is not None or re.search(
                            '[0-9]', str(row[c1])) is not None:
                    continue
                if pd.isnull(row[colref]) or pd.isnull(row[c1]):
                    continue
                # if type(row[colref]) == float and np.isnan(row[colref]):
                #     continue
                # if type(row[c1]) == float and np.isnan(row[c1]):
                #     continue
                if str(row[colref]) == 'nan' or str(row[c1]) == 'nan':
                    continue
                pair = (row[colref], row[c1], 0)
                pairs.append(pair)
            if current_rows > rows_per_relation:
                break  # go to next file
    return pairs
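
The function above emits three kinds of labeled pairs per relation: (relation name, column name), (column name, cell value), and (value of the first column, value of another column in the same row), all with label 0, and it stops after roughly rows_per_relation rows per file. A hedged usage sketch, assuming csv_access and pandas are importable; the directory path is a placeholder, not from the original:

# Illustrative only: the path below is a placeholder.
pairs = extract_data_from_directory(path="/path/to/csv/relations/", rows_per_relation=5)
for left, right, label in pairs[:10]:
    print(left, "<->", right, "| label:", label)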
Example #5
def get_location_dictionary(path):
    files = csv_access.list_files_in_directory(path)
    return get_location_dictionary_from_files(files)
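
get_location_dictionary_from_files is not shown in these examples, but from the way loc_dic is consumed in get_spo_from_uns and get_sqa, the returned dictionary maps each file path to a unique integer index. An illustrative sketch of the expected shape; the path is a placeholder:

# Illustrative shape only; the mapping is built by get_location_dictionary_from_files.
loc_dic = get_location_dictionary("/path/to/relations/")
# e.g. {"/path/to/relations/a.csv": 0, "/path/to/relations/b.csv": 1, ...}
next_free_index = max(loc_dic.values()) + 1  # how the other examples extend it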
Example #6
def get_sqa(path="/data/smalldatasets/clean_triple_relations/",
            filter_stopwords=False,
            loc_dic=None):

    all_files = csv_access.list_files_in_directory(path)
    data = []

    if loc_dic is not None:
        # retrieve max index
        loc_idx = max(loc_dic.values(), default=-1) + 1  # the next free index (default handles an empty dict)

    for fname in all_files:
        if loc_dic is not None:
            loc_dic[fname] = loc_idx
            loc_idx += 1
        df = pd.read_csv(fname, encoding='latin1')
        for index, row in df.iterrows():
            s, p, o = row['s'], row['p'], row['o']
            s = nlp_utils.filter(s)
            p = nlp_utils.filter(p)
            o = nlp_utils.filter(o)
            if s == "" or p == "" or o == "":
                continue
            this_support = s.split(" ") + p.split(" ") + o.split(" ")
            this_question = s.split(" ") + p.split(" ")
            this_answer = o.split(" ")
            this_question2 = p.split(" ") + o.split(" ")
            this_answer2 = s.split(" ")
            # if filter_stopwords:
            #     this_support = [w for w in this_support if w not in english]
            #     this_question = [w for w in this_question if w not in english]
            #     this_question2 = [w for w in this_question2 if w not in english]
            #     this_answer = [w for w in this_answer if w not in english]
            #     this_answer2 = [w for w in this_answer2 if w not in english]

            #if len(this_support) != 0 and len(this_question) != 0 and len(this_answer) != 0:
            el1 = this_support, this_question, this_answer
            # print(el1)
            data.append(el1)
            #if len(this_support) != 0 and len(this_question2) != 0 and len(this_answer2) != 0:
            el2 = this_support, this_question2, this_answer2
            # print(el2)
            data.append(el2)

    print("Original data: " + str(len(data)))
    # Identify contained elements and remove those facts
    to_remove = set()
    for idx in range(0, len(data), 2):
        fact, _, _ = data[idx]
        for jdx in range(0, len(data), 2):
            if jdx == idx:
                continue
            alt_fact, _, _ = data[jdx]
            if len(set(fact) - set(alt_fact)) == 0:
                to_remove.add(idx)
                to_remove.add(idx + 1)  # there are pairs of them
                #break  # move on to next fact

    # drop facts marked for removal, as well as entries whose support is 2 tokens or shorter
    data = [
        el for idx, el in enumerate(data)
        if idx not in to_remove and len(el[0]) > 2
    ]
    # for el in data:
    #      print(el)

    print("Proc data: " + str(len(data)))

    return data, loc_dic
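
For each filtered triple (s, p, o), the loop above appends two question-answer entries that share the same support: one asking for the object given subject and predicate, and one asking for the subject given predicate and object. A minimal illustration with made-up values:

# Illustrative values; this mirrors how get_sqa expands one triple.
s, p, o = "mike", "teaches", "databases"
support = s.split(" ") + p.split(" ") + o.split(" ")        # ['mike', 'teaches', 'databases']
qa1 = (support, s.split(" ") + p.split(" "), o.split(" "))  # question: s + p, answer: o
qa2 = (support, p.split(" ") + o.split(" "), s.split(" "))  # question: p + o, answer: s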
Example #7
                total_tokens += len(tokens)
    print("Total tokens: " + str(total_tokens))
    print("Avg tokens: " + str(float(total_tokens/total_lines)))

if __name__ == "__main__":
    print("Process relations")

    statistics_words("/Users/ra-mit/data/fabric/academic/clean_relations/")
    # exit()

    path = "/Users/ra-mit/data/fabric/academic/relations/"
    out_path = "/Users/ra-mit/data/fabric/academic/clean_relations/"

    pronouns = ["She", "He", "she", "he"]

    all_files = csv_access.list_files_in_directory(path)

    #all_files = [all_files[0]]

    for fpath in all_files:
        name = (fpath.split("/")[-1]).split(".")[0]

        pre_processed_tokens = []
        with open(fpath, "r") as f:
            relations = f.readlines()
            for r in relations:
                tokens = r.split(" ")[1:]  # drop the leading number
                pre_tokens = tp.tokenize(" ".join(tokens), " ")  # tokenize and clean the text
                pre_tokens = [el.strip() for el in pre_tokens]
                # replace pronouns with names
                for idx in range(len(pre_tokens)):
Example #8
def main(argv):
    ifiles = ""
    ofile = ""
    training_data_path = ""
    term_dictionary_path = ""
    sparsity_code_size = 16
    try:
        opts, args = getopt.getopt(argv, "hvi:o:t:d:s:")
    except getopt.GetoptError:
        print("usage: -i <input_files> -o <output_dir> -t <training_data_path> -d <term_dictionary_path> -s <sparsity_code_size>")
        sys.exit(2)

    for opt, arg in opts:
        if opt == "-h":
            print("wrong!")
            sys.exit()
        elif opt in "-i":
            ifiles = arg
        elif opt in "-o":
            ofile = arg
        elif opt in "-t":
            training_data_path = arg
        elif opt in "-d":
            term_dictionary_path = arg
        elif opt in "-s":
            sparsity_code_size = int(arg)

    # load existing dic
    term_dic = None
    with open(term_dictionary_path, "rb") as f:
        term_dic = pickle.load(f)

    # create vectorizer
    idx_vectorizer = IndexVectorizer(vocab_index=term_dic,
                                     sparsity_code_size=sparsity_code_size)
    vectorizer = tp.CustomVectorizer(idx_vectorizer)

    # reload existing training data into new file
    with gzip.open(ofile + "/pairsdata/training_data.pklz", "wb") as g:
        with gzip.open(training_data_path, "rb") as f:
            try:
                while True:
                    x1, x2, y = pickle.load(f)
                    pickle.dump((x1, x2, y), g)
            except EOFError:
                print("rewritten")
        # read all unstructured files
        all_files = csv_access.list_files_in_directory(ifiles)
        #offset = 1
        for i in range(len(all_files)):
            ifile = all_files[i]
            # get positive pairs from ifile
            for x1, x2 in gen_pos_pairs(ifile, vectorizer):
                pickle.dump((x1, x2, 0), g)
            # gen negative pairs from all the jfiles
            #for jfile in all_files[offset::]:
            for jfile in all_files:
                #jfile = all_files[j]
                if ifile == jfile:
                    continue
                for x1, x2 in gen_neg_pairs(ifile, jfile,
                                            vectorizer):  # neg from i to j
                    pickle.dump((x1, x2, 1), g)
            #offset += 1  # advance offset to not repeat negative pairs

    with gzip.open(ofile + "/training_data.pklz", "rb") as f:
        with gzip.open(ofile + "/baedata/training_data.pklz", "wb") as g:
            try:
                while True:
                    x1, x2, y = pickle.load(f)
                    pickle.dump((x1, y), g)
                    pickle.dump((x2, y), g)
            except EOFError:
                print("rewritten")

    vocab, inv_vocab = vectorizer.get_vocab_dictionaries()

    with open(term_dictionary_path, "wb") as f:
        pickle.dump(vocab, f)

    print("Done!")