Example #1
def compare_all(fin1, fdin2):
    # read input data
    if fin1 is None or fin1 =="":
        return False
    texts = ufile.read_csv(fin1) # a specific file or a directory
    # read input data
    if fdin2 is None or fdin2 =="":
        return False
    EDBlist = ufile.load_files (fdin2) # a specific file or a directory
    result = []
    cur = 0
    for text in texts:
        cur += 1
        result_items_new =[]
        result_items = ast.literal_eval(text[2])
        #print result_items
        for result_item in result_items:
            #print result_item[0] in EDBlist
            if result_item[0] in EDBlist:
                result_items_new.append(result_item)
        result.append((text[0], text[1], str(result_items_new)))
       
    # output result
    fout = os.path.splitext(fin1)[0] + "_EDB.csv"
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout
    
    print ext_print ('all tasks completed\n')
    return True
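
The core step above parses the stringified candidate list with ast.literal_eval and keeps only entries whose word appears in EDBlist; a minimal standalone sketch of that filtering (the sample data is hypothetical):

import ast

EDBlist = {"simple", "plain"}                      # hypothetical word list
row = ("id1", "some sentence", "[('simple', 0.9), ('obscure', 0.4)]")
result_items = ast.literal_eval(row[2])            # parse the stringified candidate list back into tuples
kept = [item for item in result_items if item[0] in EDBlist]
print(kept)                                        # -> [('simple', 0.9)]
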
Example #2
def extract_variables (fdin, ffea, ffea2, var, cores):
    # read input dataset
    if fdin is None or fdin =="": return False
    trials = ufile.read_csv (fdin)
    if trials is None or len(trials) <= 0:
        print ext_print ('input data error, please check either no such file or no data --- interrupting')
        return False
    print ext_print ('found a total of %d data items' % len(trials))
    
    # read feature list - domain knowledge
    if ffea is None or ffea =="": return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    for key, value in fea_dict_dk.iteritems():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '': feature_dict_dk[name.strip()] = key

    # read feature list - umls
    if ffea2 is None or ffea2 =="": return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False


    output = Manager().list()
    jobs = []
    for i in xrange(1,cores+1):
        t = Process(target=worker, args=(trials, len(trials)*(i-1)/cores,len(trials)*i/cores-1, var, features, feature_dict_dk, fea_dict_umls, output))
        jobs.append(t)
        t.start()    
    for j in jobs: j.join()

    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv (fout, output)
    print ext_print ('saved processed results into: %s' % fout)
    return True
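
A standalone sketch of the index partitioning used above, splitting the input into contiguous slices for each of `cores` worker processes; the worker body here is only a placeholder for the real extraction:

from multiprocessing import Manager, Process

def worker(items, start, end, output):
    # placeholder worker: the real code runs the extraction on items[start:end + 1]
    for item in items[start:end + 1]:
        output.append(item)

if __name__ == '__main__':
    trials = list(range(10))                      # hypothetical data
    cores = 3
    output = Manager().list()
    jobs = []
    for i in range(1, cores + 1):
        start = len(trials) * (i - 1) // cores    # same contiguous slicing as above
        end = len(trials) * i // cores - 1
        p = Process(target=worker, args=(trials, start, end, output))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()
    print(list(output))
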
Example #3
def compare_all(fin1):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory
    result = []
    start_time = time.time()
    cur = 0
    for text in texts:
        simi_valuesList = []
        cur += 1
        if len(text[1].split('.')) > 1:
            target_word, pos = text[1].split('.')[0], text[1].split('.')[1]
        else:
            target_word, pos = text[1], None
        print "%d of %d" % (cur, len(texts)), target_word
        candidatewords = text[2]
        candidatewords = ast.literal_eval(candidatewords)
        simi_values = []
        for candidate in candidatewords:
            #print "candidate:"
            #print candidate
            word2 = candidate[0]
            # print word2
            try:
                simi_values = gensim_model.similarity(target_word, word2)
            except KeyError:
                simi_values = 0
            # word_sim[word2] = round(float(simi_values), 5)
            simi_valuesList.append((word2, round(float(simi_values), 5)))
        simi_valuesList.sort(key=operator.itemgetter(1),
                             reverse=True)  # sort by rank value
        print "simi_valuesList:"
        print simi_valuesList[:30]
        result.append((text[0], text[1], simi_valuesList[:30]))
        print result
    print("--- %s seconds ---" % (time.time() - start_time))
    fout = os.path.splitext(fin1)[0] + "_rank.csv"
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout
    print ext_print('all tasks completed\n')
    return True
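
The ranking step relies on gensim's word-vector similarity; a minimal sketch of that lookup pattern, assuming a pre-loaded KeyedVectors model (the model path and words are hypothetical):

from gensim.models import KeyedVectors

# hypothetical pre-trained vectors; any word2vec-format file works here
gensim_model = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)

def rank_candidates(target_word, candidates, top_n=30):
    scored = []
    for word2 in candidates:
        try:
            value = gensim_model.similarity(target_word, word2)   # cosine similarity
        except KeyError:                                          # out-of-vocabulary word
            value = 0
        scored.append((word2, round(float(value), 5)))
    scored.sort(key=lambda x: x[1], reverse=True)                 # highest similarity first
    return scored[:top_n]
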
Example #4
def CT_extractxml (fin, fout=None):
	processed_list = [] # record processed files here to avoid redundancy
	for root, dir, files in os.walk(fin):
		for f in files:
			if not f.endswith(".xml") or f in processed_list:
				continue
			print ext_print (f)
			processed_list.append(f)
			if len(processed_list)%1000 == 0:
				print ('Processing  %d' % len(processed_list))

			output = []
			# read input data
			fdin = os.path.join(root, f)
			text = ufile.read_file (fdin, 3, False)
			if text is not None:
				ct_xml = xml_parser.fromstring(text)
				blocks = ct_xml.findall('MedlineCitation')
				ct_xml = ""
				for block in blocks:
					(PMID, JournalTitle, PubDate, ArticleTitle, Abstract, Keywords) =  extract_component(block)
					output.append((PMID, JournalTitle, PubDate, ArticleTitle, Abstract, Keywords))
				blocks = []

			# set output data file
			fout = os.path.splitext(fdin)[0] + "_extracted.csv"

			ufile.write_csv (fout, output)
			print ext_print ('saved result in: %s' % fout)

	print ext_print ('all tasks completed\n')
	return True
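
extract_component is project-specific, but the surrounding parsing maps onto the standard xml.etree.ElementTree API; a minimal sketch with a hypothetical record:

import xml.etree.ElementTree as xml_parser

text = """<PubmedArticleSet>
  <MedlineCitation><PMID>12345</PMID><Article><ArticleTitle>Example title</ArticleTitle></Article></MedlineCitation>
</PubmedArticleSet>"""

ct_xml = xml_parser.fromstring(text)
for block in ct_xml.findall('MedlineCitation'):
    pmid = block.findtext('PMID')                     # simple field lookup per citation block
    title = block.findtext('Article/ArticleTitle')
    print((pmid, title))
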
Example #5
def file_merge(fdin, fout, columns, format):
    # read input data
    if fdin is None or fdin == "":
        return False
    texts = ufile.read(fdin)  # a specific file or a directory
    result = []
    print texts
    if columns == "all":
        result = texts
    else:
        cols = columns.split('|')
        for text in texts:
            if len(cols) == 1:
                result.append(text[int(cols[0]) - 1])
            else:
                for col in cols:
                    result.append(text[int(col) - 1])

    print ext_print('get %d in total' % len(result))

    # get output data directory
    if fout is None:
        fout = os.path.splitext(fdin)[0] + "_merged" + format
    # output detailed result into file
    if format == "" or format == ".txt":
        ufile.write_file(fout, result, False)
    elif format == ".csv":
        ufile.write_csv(fout, result)
    print ext_print('saved result into: %s' % fout)
    print ext_print('all tasks completed\n')
    return True
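
A standalone sketch of the '|'-separated column selection used above, where column indices in the spec string are 1-based (the sample rows are hypothetical):

texts = [("a", "b", "c"), ("d", "e", "f")]   # hypothetical rows
columns = "1|3"

result = []
if columns == "all":
    result = texts
else:
    cols = columns.split('|')
    for text in texts:
        for col in cols:
            result.append(text[int(col) - 1])   # columns are 1-based in the spec string
print(result)                                    # -> ['a', 'c', 'd', 'f']
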
Example #6
def Insert_DB(fin):
    for root, dir, files in os.walk(fin):
        for f in files:
            if not f.endswith(".csv"):
                continue
            print ext_print(f)

            output = []
            # read input data
            fdin = os.path.join(root, f)
            rows = ufile.read_csv(fdin)
            for row in rows:
                # param = (PMID, JournalTitle, PubDate, ArticleTitle, Abstract, Keywords)
                PubDate = row[2]

                if PubDate != '2009':  # to split tasks across different machines with MySQL, process tables separately
                    continue
                table = "article_" + PubDate
                #
                # if (row[2] == '' or row[2] is None): PubDate = 0
                # PubDate = int(PubDate)
                # table = 'article_0-1950'
                # if 2000 >= PubDate >= 1951:
                #     table = 'article_1951-2000'
                # if 2005 >= PubDate >= 2001:
                #     table = 'article_2001-2005'
                # elif PubDate > 2005:
                #     table = 'article_'+ str(PubDate)
				
                param = (row[0], row[1], PubDate, row[3], row[4], row[5])
                sql = "INSERT INTO `" + table + "` (`PMID`, `JournalTitle`, `PubDate`, `ArticleTitle`, `Abstract`, `Keywords`) VALUES(%s, %s, %s, %s, %s, %s);"
                msg = db.execute(sql, param)
                if msg != 1: print msg

    print ext_print('all tasks completed\n')
    return True
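
The db.execute call wraps an ordinary parameterized INSERT; a minimal sketch under the assumption that the helper sits on a MySQLdb/pymysql-style connection (connection settings, table, and values are hypothetical):

import MySQLdb   # assumption: the db helper wraps a MySQLdb-compatible connection

conn = MySQLdb.connect(host='localhost', user='user', passwd='pass', db='pubmed')
cur = conn.cursor()
sql = ("INSERT INTO `article_2009` "
       "(`PMID`, `JournalTitle`, `PubDate`, `ArticleTitle`, `Abstract`, `Keywords`) "
       "VALUES (%s, %s, %s, %s, %s, %s);")
param = ('12345', 'Some Journal', '2009', 'Title', 'Abstract text', 'kw1;kw2')
cur.execute(sql, param)   # the driver escapes each %s placeholder
conn.commit()
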
Example #7
def compare_all(fin1, fin2, n, fin3, fout1=None):

    # read Google 1T corpus
    print ext_print("start to load Google 1T grams")

    # load gram data
    Goole_grams, max_fre = {}, 0
    if fin2 is None or fin2 == "":
        return False
    fid = open(fin2, 'r')
    for line in fid:
        line = line.strip()
        if len(line) > 0:
            max_fre += 1
            if max_fre % 1000000 == 0:
                print max_fre
            tem = line.split('\t')
            if len(tem) <= 1:
                print ext_print("Data error! please check!" + str(tem))
    fid.close()
    print ext_print("all files loaded" + str(max_fre))
    return True
Example #8
def temporal_processing(fin,
                        fout=None,
                        type="testing",
                        fin_t=None,
                        rep_enable=False,
                        rep_word="",
                        event=False,
                        X3=False):

    # read the input data
    if (fin) is None:
        print ext_print('no input file found --- interrupting')
        return
    texts = ufile.read_file(fin, 1, False)
    if texts is None or len(texts) <= 0:
        print ext_print('no text available for processing --- interrupting')
        return

    print ext_print('start to process temporal information in text file %s' %
                    fin)

    if type == "training":
        tpatts = temporal_training(texts)

        # output pattern result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fin)[0] + "_pat" + os.path.splitext(fin)[1]

        ufile.write_file(fout, sorted(tpatts, key=tpatts.get, reverse=True),
                         False)
        print ext_print('saved trained patterns into: %s' % fout)

    elif type == "testing":
        # read the pattern data
        if (fin_t) is None:
            print ext_print('no pattern file found --- interrupting')
            return
        tpatts = ufile.read_file(fin_t, 1, False)
        if tpatts is None or len(tpatts) <= 0:
            print ext_print(
                'no patterns available for processing --- interrupting')
            return

        result = temporal_testing(texts, tpatts, rep_enable, rep_word, event)
        if X3:
            result = using_TimeX3(result)

        # output result
        if (fout is None) or (fout == ""):
            if X3:
                fout = os.path.splitext(fin)[0] + "_TEXer.xml"
            else:
                fout = os.path.splitext(fin)[0] + "_TEXer" + os.path.splitext(
                    fin)[1]

        ufile.write_file(fout, result, False)
        print ext_print('saved processed results into: %s' % fout)

    print ext_print('all tasks completed\n')
    return True
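
A hedged usage sketch of the two modes above; the file names are hypothetical, and temporal_training/temporal_testing are project functions assumed to be importable alongside this wrapper:

# training: learn temporal patterns from an annotated text file
temporal_processing('corpus_train.txt', type="training")

# testing: apply previously trained patterns, optionally emitting TimeML/X3-style output
temporal_processing('corpus_test.txt', type="testing", fin_t='corpus_train_pat.txt', X3=True)
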
def compare_all(fin1, fin2, n, fin3, fout1=None):
    # Referee related words of target words to reduce the size of loaded file into memory
    if fin1 is None or fin1 == "":
        return False
    orig_texts = ufile.read_csv(fin1)  # a specific file or a directory
    print orig_texts
    Related_words = {}
    for text in orig_texts:
        target_word = text[2].split('.')[0]
        words = text[1].lower().split()  # sentence words
        if (target_word in words) and len(words) > 1:
            temp_ngrams = find_ngrams(
                words, n)  # get all of sentence ngram candidates
            for ngram in temp_ngrams:
                if target_word in ngram:  # get target_word`s candidates
                    for te in ngram:
                        if te != target_word:
                            Related_words[te] = 1  # mark te as a related word
    print ext_print("Identified all related words")

    # Referee candidate words to reduce the size of loaded file into memory
    if fin3 is None or fin3 == "":
        return False
    candidate_words = {}
    for fin3_each in fin3.split(";"):
        test_data = ufile.read_csv(fin3_each)  # a specific file or a directory
        for i in range(len(test_data)):
            can_words = ast.literal_eval(
                test_data[i][2])  # parse string to array
            for can_word in can_words:
                if can_word[0] not in candidate_words:
                    candidate_words[can_word[0]] = 1
    print ext_print("Identified all candidate words")

    # read Google 1T corpus
    print ext_print("start to load Google 1T grams")
    Goole_grams, count, max_fre, c1, c2 = {}, 0, 0, 0, 0
    if fin2 is None or fin2 == "":
        return False
    fid = open(fin2, 'r')
    for line in fid:
        line = line.lower()
        count += 1
        if count % 10000000 == 0:
            print count
        if len(line) > 0:
            tem = line.split('\t')
            '''if len(tem) > 1:
                if tem[0] not in Goole_grams:
                    Goole_grams[tem[0]] = tem[1]
                    if long(tem[1]) > max_fre:  # reduce ordering calculations
                        max_fre = long(tem[1])'''
            if len(tem) == 1:
                c1 += 1
            if len(tem) > 1:
                c2 += 1
                temws = tem[0].split()
                find_candidate, find_related = False, False  # reduce memory usage
                for temw in temws:
                    if temw in candidate_words:
                        find_candidate = True
                    elif temw in Related_words:
                        find_related = True
                if find_candidate and find_related:
                    Goole_grams[tem[0]] = tem[1]
                    if long(tem[1]) > max_fre:  # reduce ordering calculations
                        max_fre = long(tem[1])

    fid.close()
    print count
    print("c1=%d,c2=%d" % (c1, c2))
    print ext_print("all files loaded")
    #     max_fre = max(map(float, Goole_grams.values())) # reduce memory usage
    if max_fre == 0:
        print ext_print("Data error! please check!")
        return
    else:
        print ext_print("Total number is %d" % len(Goole_grams))
    lemmatizer = WordNetLemmatizer()

    #betas = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
    # betas = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35]
    betas = [0.5]
    m, t, p = 0, 0, 0
    for beta in betas:
        # read candidate words
        for fin3_each in fin3.split(";"):
            candidate_words = ufile.read_csv(
                fin3_each)  # a specific file or a directory

            ranked_result = []
            for i in xrange(len(orig_texts)):
                text = orig_texts[i]
                can_words = ast.literal_eval(
                    candidate_words[i][2])  # parse string to array
                words = text[1].lower().split()
                target_word = text[2].split('.')[0]
                # print target_word
                if (target_word in words) and len(words) > 1:
                    candiate_ngrams, temp_ngrams = [], find_ngrams(
                        words, n)  # get ngram candidates
                    for ngram in temp_ngrams:
                        if target_word in ngram:
                            candiate_ngrams.append(
                                (ngram, ngram.index(target_word)))
                    ranks = {}
                    for can_word in can_words:
                        # unpack: can_word[0] is the candidate word, can_word[1] its original score
                        can_word, can_word_value, fre_can_word, max_context = can_word[0], float(can_word[1]), 0.0, 0.0
                        lemma_can_word = lemmatizer.lemmatize(can_word)
                        for (ngram, k) in candiate_ngrams:  # k is the index of target_word within the ngram
                            lst = list(ngram)
                            le_lst = list(ngram)
                            lst[k] = can_word
                            can_context = ' '.join(lst)    # the ngram with target_word replaced by the candidate
                            le_lst[k] = lemma_can_word
                            le_context = ' '.join(le_lst)  # same replacement using the lemmatized candidate
                            t += 1
                            if can_context in Goole_grams:
                                m += 1
                                fre_can_word = float(Goole_grams[can_context])
                                max_context = max(max_context, fre_can_word)
                            elif le_context in Goole_grams:
                                p += 1
                                fre_can_word = float(Goole_grams[le_context])
                                max_context = max(max_context, fre_can_word)
                        # change strategies for calculating 1gram, 2gram, 3gram, or their combination
                        ranks[can_word] = (
                            1 - beta) * can_word_value + beta * math.sqrt(
                                max_context / float(max_fre))
                    sorted_ranks = sorted(ranks.items(),
                                          key=operator.itemgetter(1),
                                          reverse=True)  # sort by rank value
                    ranked_result.append((text[0], text[2], sorted_ranks))

                    # print ranked_result

                else:
                    ranked_result.append((text[0], text[2], can_words))
            # get output data directory
            fout1 = fin3_each.replace(
                ".csv", "_Rank" + str(n) + "gram+" + str(beta) + ".csv")
            ufile.write_csv(fout1, ranked_result)
            print ext_print('saved result into: %s' % fout1)

    return True
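
The re-ranking score above combines the candidate's original score with a normalized n-gram frequency; a standalone sketch of that formula:

import math

def rerank(can_word_value, max_context, max_fre, beta=0.5):
    # (1 - beta) weights the original candidate score; beta weights the
    # square-rooted n-gram frequency, normalized by the corpus maximum
    return (1 - beta) * can_word_value + beta * math.sqrt(max_context / float(max_fre))

print(rerank(0.8, 2500.0, 10000.0))   # -> 0.65 with beta = 0.5
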
Example #10
def extract_variables (fdin, ffea, ffea2, var):
    # read input data
    if fdin is None or fdin =="": return False
    trials = ufile.read_csv (fdin)
    if trials is None or len(trials) <= 0:
        print ext_print ('input data error, please check either no such file or no data --- interrupting')
        return False
    print ext_print ('found a total of %d data items' % len(trials))
    
    # read feature list - domain knowledge
    if ffea is None or ffea =="": return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var:fea_dict_dk[var]}
    for key, value in fea_dict_dk.iteritems():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '': feature_dict_dk[name.strip()] =key

    # read feature list - UMLS (can be replaced by full UMLS)
    if ffea2 is None or ffea2 =="": return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    #load numeric feature list
    Valx_core.init_features()

    output = []
    for i in xrange(len(trials)):
        if i%1000 == 0:
            print ('processing %d' % i)
        # pre-processing eligibility criteria text
        text = Valx_core.preprocessing(trials[i][1]) # trials[i][1] is the eligibility criteria text
        (sections_num, candidates_num) = Valx_core.extract_candidates_numeric(text) # extract candidates containing numeric features
        for j in xrange(len(candidates_num)): # for each candidate
            exp_text = Valx_core.formalize_expressions(candidates_num[j]) # identify and formalize values
            (exp_text, key_ngrams) = Valx_core.identify_variable(exp_text, feature_dict_dk, fea_dict_umls) # identify variable mentions and map them to names
            (variables, vars_values) = Valx_core.associate_variable_values(exp_text)
            all_exps = []
            for k in xrange(len(variables)):
                curr_var = variables[k]
                curr_exps = vars_values[k]
                if curr_var in features:
                    fea_list = features[curr_var]
                    curr_exps = Valx_core.context_validation(curr_exps, fea_list[1], fea_list[2])                           
                    curr_exps = Valx_core.normalization(fea_list[3], curr_exps) # unit conversion and value normalization
                    curr_exps = Valx_core.hr_validation (curr_exps, float(fea_list[4]), float(fea_list[5])) # heuristic rule-based validation
                if len(curr_exps) > 0:
                    if var == "All" or var.lower() == curr_var.lower() or var.lower() in curr_var.lower(): all_exps += curr_exps                     
                 
            if len(all_exps) > 0: output.append((trials[i][0], sections_num[j], candidates_num[j], exp_text, str(all_exps).replace("u'", "'"))) # output result

    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv (fout, output)
    print ext_print ('saved processed results into: %s' % fout)
    return True
Example #11
def Extract_nonGT(fdin, fout, fin_, fout_, c):

    #----------------------------------initialize and load supporting data
    # read input data
    all_texts = []
    if fdin is None or fdin == "":
        return False

    elif fdin.endswith(".txt"):
        all_texts = ufile.load_files(fdin)
        if all_texts is None or len(all_texts) <= 0:
            print ext_print(
                'input data error, please check either no such file or no data --- interrupting'
            )
            return
        print ext_print('found a total of %d data items' % len(all_texts))

        output = []
        for text in all_texts:
            text = text.lower()
            result = GAXer_Ggender(text)
            output.append(result)

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.txt"

        ufile.write_file(fout, output, False)

    elif fdin.endswith(".csv"):
        all_texts = ufile.load_files(fdin)  # a specific file or a directory
        all_texts_ = ufile.load_files(fin_)  # a specific file or a directory
        if all_texts is None or len(all_texts) <= 0:
            print ext_print(
                'input data error, please check either no such file or no data --- interrupting'
            )
            return
        print ext_print('found a total of %d data items' % len(all_texts))

        output = []
        output_ = []
        i = 0
        cnt = 0
        cho = 0
        j = 100
        jump = int(j * random.random()) + 2
        goadList = {}
        for t in all_texts_:
            goadList[t[0]] = 1

        for texts in all_texts:
            if i % 1000 == 0:
                print ext_print('processing %d' % i)
            i += 1

            #             if str(texts[0])<>'NCT00002967':
            #                 continue
            cop = texts
            inclusive = texts[5].lower()
            inclusive = inclusive[0:inclusive.find('exclusi')]
            combine_texts = texts[2].lower() + ". " + texts[3].lower() + ". " + texts[4].lower() + ". " + inclusive
            pre_label = 'Biological ' + texts[1][0].upper() + texts[1][1:]
            result = GAXer_Ggender(combine_texts, pre_label)
            '''
            if 'Transgender' not in str(result):
                FindSame = texts[0] in goadList.keys()
                if not FindSame:
                    if cho==jump:
                        output_.append((cop[0],cop[1],cop[2],cop[3],cop[4],cop[5]))
                        cnt+=1
                        jump=int(j*random.random())+2
                        cho=0
                    cho+=1
            '''
            if 'Transgender' not in str(result):
                FindSame = texts[0] in goadList.keys()
                if not FindSame:
                    output_.append(
                        (cop[0], cop[1], cop[2], cop[3], cop[4], cop[5]))
                    cnt += 1
            if cnt == c:
                break

            if len(result) == 0 or (len(texts[1]) > 0 and len(result) == 1
                                    and pre_label in result):
                continue
            else:
                t = texts[0]
                t = t.replace('"', '')
                t = str(t)
                output.append((t, texts[1], str(result)))

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.csv"

        ufile.write_csv(fout, output)
        ufile.write_csv(fout_, output_)

    print ext_print('saved processed results into: %s' % fout)

    print ext_print('all tasks completed\n')
    return True
Example #12
def GAXer_wrapper(fdin, fout=None):

    #----------------------------------initialize and load supporting data
    # read input data
    all_texts = []
    if fdin is None or fdin == "":
        return False

    elif fdin.endswith(".txt"):
        all_texts = ufile.load_files(fdin)
        if all_texts is None or len(all_texts) <= 0:
            print ext_print(
                'input data error, please check either no such file or no data --- interrupting'
            )
            return
        print ext_print('found a total of %d data items' % len(all_texts))

        output = []
        for text in all_texts:
            text = text.lower()
            result = GAXer_Ggender(text)
            output.append(result)

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.txt"

        ufile.write_file(fout, output, False)

    elif fdin.endswith(".csv"):
        all_texts = ufile.load_files(fdin)  # a specific file or a directory
        if all_texts is None or len(all_texts) <= 0:
            print ext_print(
                'input data error, please check either no such file or no data --- interrupting'
            )
            return
        print ext_print('found a total of %d data items' % len(all_texts))

        output = []
        i = 0
        for texts in all_texts:
            if i % 1000 == 0:
                print ext_print('processing %d' % i)
            i += 1

            #             if str(texts[0])<>'NCT00002967':
            #                 continue
            inclusive = texts[5].lower()
            inclusive = inclusive[0:inclusive.find('exclusi')]
            #            combine_texts = texts[2].lower() + ". " + texts[3].lower() + ". " + texts[4].lower() + ". " + inclusive
            combine_texts = texts[3].lower() + ". " + texts[4].lower() + ". " + inclusive
            pre_label = 'Biological ' + texts[1][0].upper() + texts[1][1:]
            result = GAXer_Ggender(combine_texts, pre_label)
            #            print result
            #            if len(result)==0 or (len(texts[1])>0 and len(result)==1 and pre_label in result):
            if len(result) == 0:
                continue
            else:
                t = texts[0]
                t = t.replace('"', '')
                t = str(t)
                output.append((t, texts[1], str(result)))

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.csv"

        ufile.write_csv(fout, output)

    print ext_print('saved processed results into: %s' % fout)

    print ext_print('all tasks completed\n')
    return True
Example #13
def compare_all(fin1, fdin2, fdin3):
    # read input data
    if fin1 is None or fin1 =="":
        return False
    texts = ufile.read_csv(fin1) # a specific file or a directory
    # read input data
    if fdin2 is None or fdin2 =="":
        return False
    EDBlist = ufile.load_files (fdin2) # a specific file or a directory
    # read input data
    if fdin3 is None or fdin3 =="":
        return False
    FreCorpus = ufile.read_file_dict_tokenized(fdin3, '\t')
    
    result = []
    words_sims = {}
    cur = 0
    for text in texts:
        cur += 1
        if len(text[2].split('.')) > 1:
            target_word, pos = text[2].split('.')[0], text[2].split('.')[1]
        else:
            target_word, pos = text[2], None
        print "%d of %d" % (cur, len(texts)), target_word
        simi_values = []
        if target_word not in words_sims:
            processed = []
            processed.append(target_word)
            # step 1 ============== 
            can_words =[]
            syn = wordnet.synsets(target_word)
            if len(syn) > 0:
                for l in syn[0].lemmas():
                    if l.name() not in can_words:
                        can_words.append(l.name())
            word_fre = {}
            for word_each in can_words:
                if word_each in EDBlist and word_each not in processed:
                    word_each_fre = 0 
                    if (word_each in FreCorpus):
                        word_each_fre = int(FreCorpus[word_each])
                    word_fre[word_each] = word_each_fre
                    processed.append(word_each)
            word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value
            simi_values.extend(word_fre)
            # step 2 ==============  
            can_words =[]
            syn = wordnet.synsets(target_word)
            if len(syn) > 0:
                syn_word = syn[0].hypernyms()
                for l in syn_word:
                    if (l.pos() in ['v', 'n', 'a']):
                        for k in l.lemmas():
                            if k.name() not in can_words:
                                can_words.append(k.name())
            word_fre = {}
            for word_each in can_words:
                if word_each in EDBlist and word_each not in processed:
                    word_each_fre = 0 
                    if (word_each in FreCorpus):
                        word_each_fre = int(FreCorpus[word_each])
                    word_fre[word_each] = word_each_fre
                    processed.append(word_each)
            word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value
            simi_values.extend(word_fre)
            # step 3 ==============  
            can_words =[]
            for syn in wordnet.synsets(target_word):
                for l in syn.lemmas():
                    if l.name() not in can_words:
                        can_words.append(l.name())
            word_fre = {}
            for word_each in can_words:
                if word_each in EDBlist and word_each not in processed:
                    word_each_fre = 0 
                    if (word_each in FreCorpus):
                        word_each_fre = int(FreCorpus[word_each])
                    word_fre[word_each] = word_each_fre
                    processed.append(word_each)
            word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value
            simi_values.extend(word_fre)
            # step 4 ==============  
            can_words =[]
            for syn in wordnet.synsets(target_word):
                syn_word = syn.hypernyms()
                for l in syn_word:
                    if (l.pos() in ['v', 'n', 'a']):
                        for k in l.lemmas():
                            if k.name() not in can_words:
                                can_words.append(k.name())
            word_fre = {}
            for word_each in can_words:
                if word_each in EDBlist and word_each not in processed:
                    word_each_fre = 0 
                    if (word_each in FreCorpus):
                        word_each_fre = int(FreCorpus[word_each])
                    word_fre[word_each] = word_each_fre
                    processed.append(word_each)
            word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value
            simi_values.extend(word_fre)                  
            #=================================
            words_sims[target_word] = simi_values
            print simi_values[:2]
        else:
            simi_values = words_sims[target_word]
        result.append((text[0], text[2], simi_values))
       
    # output result
    fout = os.path.splitext(fin1)[0] + "_4steps.csv"
    ufile.write_csv(fout, result)
    print 'saved result into: %s' % fout
    
    print ext_print ('all tasks completed\n')
    return True
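
The four steps above draw candidates from WordNet synonyms and hypernyms; a standalone sketch of the first two lookups using nltk's wordnet interface (requires the NLTK WordNet data; the target word is hypothetical):

from nltk.corpus import wordnet

target_word = 'dog'               # hypothetical target
syn = wordnet.synsets(target_word)
if syn:
    # step 1: synonyms from the first synset
    step1 = [l.name() for l in syn[0].lemmas()]
    # step 2: lemmas of the first synset's hypernyms (verbs, nouns, adjectives only)
    step2 = [k.name() for h in syn[0].hypernyms() if h.pos() in ['v', 'n', 'a'] for k in h.lemmas()]
    print(step1)
    print(step2)
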
Example #14
def compare_all(fin1, fdin2, method, threasholds):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory
    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    EDBlist = ufile.load_files(fdin2)  # a specific file or a directory

    threasholds = threasholds.split(';')

    # filter out candidate words that share a stem or lemma with the target word
    porter_stemmer = PorterStemmer()
    wnl = WordNetLemmatizer()
    gold, fre = [], []
    for threashold in threasholds:
        result = []
        words_sims = {}
        start_time = time.time()
        cur = 0
        for text in texts:
            cur += 1
            for i in range(len(text[3].split(";"))):
                fre.append(text[3].split(";")[i].split(":")[1])
                gold.append(text[3].split(";")[i].split(":")[0])
            if len(text[2].split('.')) > 1:
                target_word, pos = text[2].split('.')[0], text[2].split('.')[1]
                stemming_tw = porter_stemmer.stem(target_word)
                lemma_tw = wordnet.morphy(target_word, pos=pos)
                #lemma_tw = wnl.lemmatize(target_word, pos)
                print lemma_tw

            else:
                target_word, pos = text[2], None
                stemming_tw = porter_stemmer.stem(target_word)
                lemma_tw = wordnet.morphy(target_word)
                #lemma_tw = wnl.lemmatize(target_word, pos)

            print("%d of %d" % (cur, len(texts)), target_word)
            simi_values = []

            if target_word not in words_sims:
                word_sim = {}
                for word2 in EDBlist:
                    stemming_cw = porter_stemmer.stem(word2)
                    lemma_word = wordnet.morphy(word2)
                    if word2 not in word_sim:
                        #if target_word !=word2:
                        if target_word != word2 and stemming_cw != stemming_tw and lemma_word != lemma_tw:
                            # simi_value=compare_allsynsets(method, target_word, word2, TWpos, SYNpos, pos)
                            # simi_value = compare_allsynsets(method, target_word, word2, TWpos, pos)
                            # simi_value = compare_allsynsets(method, target_word, word2, SYNpos)
                            simi_value = compare_allsynsets(
                                method, target_word, word2)
                            if simi_value > float(threashold):
                                word_sim[word2] = round(float(simi_value), 3)
                simi_values = sorted(word_sim.items(),
                                     key=operator.itemgetter(1),
                                     reverse=True)  # sort by rank value
                words_sims[target_word] = simi_values
            else:
                simi_values = words_sims[target_word]
            result.append((text[0], text[2], simi_values))
        print("--- %s seconds ---" % (time.time() - start_time))
        # output result
        fout = os.path.splitext(fin1)[0] + "_%s_%s.csv" % (method, threashold)
        # if SYNpos:
        #     fout = fout.replace(".csv", "_SYNpos.csv")
        # if TWpos:
        #     fout = fout.replace(".csv", "_TWpos.csv")
        ufile.write_csv(fout, result)
        print('saved result into: %s' % fout)

    print(ext_print('all tasks completed\n'))
    return True