예제 #1
0
def make_kw_approach(filename):
    """Link bacteria through shared abstract keywords and dump the results.

    Steps, in order:

    1. Call ``read_abstracts_from_xml("pubmed_result.xml")`` and iterate the
       module-level ``abstracts`` (presumably filled by that call).
    2. Extract keywords for every abstract with the ``kwe`` extractor and
       store them on ``abstract.key_words``.
    3. For every bacterium whose ``name`` occurs in the abstract text, bump
       ``bacteria.Freq`` and record ``keyword -> [abstract ids]`` in
       ``bacteria.key_words``.
    4. Write the pairwise shared-keyword counts to ``adj_matrix_strong.csv``
       via ``print_adj_matrix_to_csv1``.
    5. Write one ``<bacteria name>.csv`` per bacterium, one keyword per row.

    Args:
        filename: Passed to ``read_txt_file`` to obtain the bacteria list.
    """
    global abstracts

    # Load abstracts.  The parenthesised single-argument form prints the same
    # text on Python 2 and Python 3.
    print("Reading data file...")
    read_abstracts_from_xml("pubmed_result.xml")

    bacteria_list = read_txt_file(filename)
    for abstract in abstracts:
        kwe.make_keyword_candidates(abstract.AbstractText)
        kwe.make_cooccurrence_matrix()
        abstract.key_words = kwe.make_keywords_list()

        # Tie keywords to bacteria: key_word -> [abstract_id, abstract_id, ...]
        for bacteria in bacteria_list:
            if bacteria.name not in abstract.AbstractText:
                continue
            bacteria.Freq += 1
            for keyword in abstract.key_words:
                # setdefault creates the list on first sight of a keyword
                # without a bare ``except`` that could mask unrelated errors.
                bacteria.key_words.setdefault(keyword, []).append(abstract.id)

    print("Done with extraction keywords")

    # Adjacency matrix, strong connections.  Only the upper triangle (j > i)
    # is filled; the diagonal and lower triangle stay 0.
    size = len(bacteria_list)
    adj_matrix = [[0] * size for _ in range(size)]
    for i, first in enumerate(bacteria_list):
        for j in range(i + 1, size):
            other_keywords = bacteria_list[j].key_words
            # NOTE(review): the original compared the keyword's character set
            # against an always-empty set (a self-assignment typo), so the
            # check only ever rejected empty keywords.  That effective
            # behaviour is kept: count the non-empty keywords both bacteria
            # share.  Confirm whether comparing the abstract-id lists of the
            # two bacteria was the real intent.
            adj_matrix[i][j] = sum(
                1 for kw in first.key_words if kw and kw in other_keywords)

    print_adj_matrix_to_csv1(bacteria_list, adj_matrix, 'adj_matrix_strong.csv')

    for bacteria in bacteria_list:
        # 'wb' is the Python 2 convention for csv output files.
        with open(str(bacteria.name) + ".csv", 'wb') as csvfile:
            writer = csv.writer(csvfile)
            for keyword in bacteria.key_words:
                # writerow expects a sequence of fields; a bare string would
                # be split into one column per character.
                writer.writerow([str(keyword)])
예제 #2
0
def calculate_index_words(text):
    """Return the keyword list that the ``kwe`` extractor produces for ``text``.

    Args:
        text: The text handed to ``kwe.make_keyword_candidates``.

    Returns:
        Whatever ``kwe.make_keywords_list()`` returns after the candidate and
        co-occurrence steps have run.
    """
    # The three extractor calls are order-dependent: presumably each step
    # works on state left behind by the previous one.
    kwe.make_keyword_candidates(text)
    kwe.make_cooccurrence_matrix()
    index_words = kwe.make_keywords_list()
    return index_words
        
예제 #3
0
def make_kw_approach(filename):
    """Link bacteria through shared abstract keywords and dump the results.

    Steps, in order:

    1. Call ``read_abstracts_from_xml("pubmed_result.xml")`` and iterate the
       module-level ``abstracts`` (presumably filled by that call).
    2. Extract keywords for every abstract with the ``kwe`` extractor and
       store them on ``abstract.key_words``.
    3. For every bacterium whose ``name`` occurs in the abstract text, bump
       ``bacteria.Freq`` and record ``keyword -> [abstract ids]`` in
       ``bacteria.key_words``.
    4. Write the pairwise shared-keyword counts to ``adj_matrix_strong.csv``
       via ``print_adj_matrix_to_csv1``.
    5. Write one ``<bacteria name>.csv`` per bacterium, one keyword per row.

    Args:
        filename: Passed to ``read_txt_file`` to obtain the bacteria list.
    """
    global abstracts

    # Load abstracts.  The parenthesised single-argument form prints the same
    # text on Python 2 and Python 3.
    print("Reading data file...")
    read_abstracts_from_xml("pubmed_result.xml")

    bacteria_list = read_txt_file(filename)
    for abstract in abstracts:
        kwe.make_keyword_candidates(abstract.AbstractText)
        kwe.make_cooccurrence_matrix()
        abstract.key_words = kwe.make_keywords_list()

        # Tie keywords to bacteria: key_word -> [abstract_id, abstract_id, ...]
        for bacteria in bacteria_list:
            if bacteria.name not in abstract.AbstractText:
                continue
            bacteria.Freq += 1
            for keyword in abstract.key_words:
                # setdefault creates the list on first sight of a keyword
                # without a bare ``except`` that could mask unrelated errors.
                bacteria.key_words.setdefault(keyword, []).append(abstract.id)

    print("Done with extraction keywords")

    # Adjacency matrix, strong connections.  Only the upper triangle (j > i)
    # is filled; the diagonal and lower triangle stay 0.
    size = len(bacteria_list)
    adj_matrix = [[0] * size for _ in range(size)]
    for i, first in enumerate(bacteria_list):
        for j in range(i + 1, size):
            other_keywords = bacteria_list[j].key_words
            # NOTE(review): the original compared the keyword's character set
            # against an always-empty set (a self-assignment typo), so the
            # check only ever rejected empty keywords.  That effective
            # behaviour is kept: count the non-empty keywords both bacteria
            # share.  Confirm whether comparing the abstract-id lists of the
            # two bacteria was the real intent.
            adj_matrix[i][j] = sum(
                1 for kw in first.key_words if kw and kw in other_keywords)

    print_adj_matrix_to_csv1(bacteria_list, adj_matrix,
                             'adj_matrix_strong.csv')

    for bacteria in bacteria_list:
        # 'wb' is the Python 2 convention for csv output files.
        with open(str(bacteria.name) + ".csv", 'wb') as csvfile:
            writer = csv.writer(csvfile)
            for keyword in bacteria.key_words:
                # writerow expects a sequence of fields; a bare string would
                # be split into one column per character.
                writer.writerow([str(keyword)])
예제 #4
0
def calculate_index_words(text):
    """Run the ``kwe`` keyword pipeline on ``text`` and return its keywords.

    Args:
        text: The text handed to ``kwe.make_keyword_candidates``.

    Returns:
        The result of ``kwe.make_keywords_list()`` once the candidate and
        co-occurrence steps have been executed.
    """
    # Keep this call order: candidates first, then the co-occurrence matrix,
    # then the final list (presumably each step relies on the previous one).
    kwe.make_keyword_candidates(text)
    kwe.make_cooccurrence_matrix()
    extracted = kwe.make_keywords_list()
    return extracted