Python TextCleaner示例，TextCleaner Python示例

示例#1

0

显示文件

文件： FacebookScrapper.py 项目： jaytouz/pyMboxAnalyser

 def add_theme(self):
     url_theme = []
     if self.title:
         text = ' '.join([self.title])
         text = TextCleaner.clean_text_for_analysis(text)
         for theme, theme_words in zip(self.dict_theme.keys(),
                                       self.dict_theme.values()):
             in_list = any(
                 TextCleaner.string_in_text(s, text) for s in theme_words)
             if in_list:
                 url_theme.append(theme)
     self.theme = url_theme

示例#2

0

显示文件

    def add_theme(self):
        url_theme = []
        title = self.title if self.title else ''
        description = self.description if self.description else ''

        text = ' '.join([title, description])
        text = TextCleaner.clean_text_for_analysis(text)
        for theme, theme_words in zip(self.dict_theme.keys(),
                                      self.dict_theme.values()):
            in_list = any(
                TextCleaner.string_in_text(s, text) for s in theme_words)
            if in_list:
                url_theme.append(theme)
        self.theme = url_theme

示例#3

0

显示文件

文件： SearchCases.py 项目： hmbachelor/bachelor

def analyseDiseaseTerms(M_coo):

    listOfDiseases=["Adrenoleukodystrophy  autosomal  neonatal form","Kleine Levin Syndrome"]
    listOfSymptoms=["Normally developed boy age 5, progessive development of talking difficulties, seizures, ataxia, adrenal insufficiency and degeneration of visual and auditory functions",
                    "Jewish boy age 16, monthly seizures, sleep aggressive and irritable when woken, highly increased sexual appetite and hunger"]

    sanitizer = TextCleaner.sanitizeString()

    M_lil=M_coo.tolil()

    count=0
    for disease in listOfDiseases:
        rowIndex=_diseaseHash[disease]

        termIndices=M_lil.getrow(rowIndex).nonzero()[1][1:]

        termList=[]
        for colIndex in termIndices:
            termList.append((M_lil[rowIndex,colIndex],revTermHashTable[colIndex]))

        termList.sort()
        termList.reverse()

        printout1=[]
        #for item in termList[:20]
        #    printout1.append(item[1])
        count=0
        newTermList=[]
        for item in termList:
            if len(item[1])>7: newTermList.append(item)
        for item in newTermList[:20]:
            printout1.append(item[1])

        print 'Top 20 terms:'
        print '---------------------'
        print printout1
        print "====================="

        printout2=[]
        symptoms=listOfSymptoms[count]
        symptoms = sanitizer.sub(' ', symptoms)
        symptoms = FilterInterface.stopwordRemover(symptoms)
        symptoms=FilterInterface.porterStemmer(symptoms)
        symptoms=SearchTermDoc._modifySearchString(symptoms)
        count+=1

        for symptom in symptoms:

            for term in termList:
                if term[1]==symptom: printout2.append((termList.index(term),symptom))
        print 'Ranks of searched symptoms:'
        print '---------------------'
        print printout2
        print "====================="
        print ''

示例#4

0

显示文件

文件： testPysparse.py 项目： hmbachelor/bachelor

def testMatrixPS(fullpath):

    totalTime1=time.time()

    termHashTable=_termHashTable
    pmidHashTable=_pmidHashTable

    print "Loading matrix.."
    t1=time.time()
    spmatrix.ll_mat_from_mtx(fullpath)
    t2=time.time()
    print "Loaded matrix in "+str(t2-t1)

    # Sanitize the search vector and convert it to a list of terms
    sanitizer=TextCleaner.sanitizeString()
    searchVector=[term.lower() for term in sanitizer.sub(' ', searchVector).split(' ') if term!='']

    # Look up hashes for terms
    hashedSearchTerms=[]
    for term in searchVector:
        try:
            termHash=termHashTable[term]
        except:
            print "Did not locate",term
            continue
        hashedSearchTerms.append(termHash)

    print "Search vector:",str(searchVector),". Corresponding hash:",str(hashedSearchTerms)

    # Locate columns containing the given terms
    colVectors={}
    t1=time.time()
    for termHash in hashedSearchTerms:
        colVectors[termHash]=M.col(termHash)
    t2=time.time()
    print "Found",len(colVectors),"column(s)"
    print "Located columns matrix in "+str(t2-t1)

    # Convert the matrix to a compressed-sparse-row matrix
    M=M.tocsr()

    # Get the rows expressed by the columns above
    rowVectors={}
    for item in colVectors.items():
        colHash=item[0]
        print "colhash: "+str(colHash)
        for pmidHash in item[1]:
            rowVectors[pmidHash]=M.getrow(pmidHash).nonzero()[0]

    totalTime2=time.time()

    print "Time elapsed in total: "+str(totalTime2-totalTime1)

    print "Number of vectors: "+str(len(rowVectors))

示例#5

0

显示文件

文件： SearchTermDoc.py 项目： hmbachelor/bachelor

def _modifySearchString(searchString):

    """
    Takes a search string and returns a list of sanitized search terms.
    """

    # Sanitize the search vector and convert it to a list of terms.
    sanitizer=TextCleaner.sanitizeString()
    searchVector=[term.lower() for term in sanitizer.sub(' ', searchString).split(' ') if term!='']

    return searchVector

示例#6

0

显示文件

def main():
    # Load the documents
    data = fileIO.load_json_file('../DataCollection/GirlsTinderProfiles.json')

    # Pre-process the documents
    clean_docs = TextCleaner.clean_text(data, 'bio')

    print('Total Profiles Collected %d:' % len(clean_docs))

    # Turn the list of lists into just a list of the documents
    docs = []
    for doc in clean_docs:
        bio = ''
        for word in doc:
            bio += word + ' '
        if doc:
            bio = bio[:-1]
            docs.append(bio)

    print('Non-empty Profiles: %d' % len(docs))

    # Defines
    CONCEPT_COUNT = 5
    TERM_COUNT = 8
    N_GRAM = (1, 2)

    vectorizer = get_tfifd_vectorizer(N_GRAM)
    feature_matrix = get_tfidf_matrix(vectorizer, docs)
    features = get_feature_names(vectorizer)

    lsa = get_lsa_model(CONCEPT_COUNT)
    get_features(lsa, feature_matrix)

    for i, component in enumerate(lsa.components_):
        terms_in_comp = zip(features, component)
        sorted_terms = sorted(terms_in_comp, key=lambda x: x[1], reverse=True)[:TERM_COUNT]
        print('Concept %d:' % i)
        for term in sorted_terms:
            print(term[0])
            print(term)
        print(' ')

示例#7

0

显示文件

文件： SearchCases.py 项目： hmbachelor/bachelor

def searchDisease(M_lil, M_csc, queryString, top=20):

    """
    This function is still a work in progress..
    """

    sanitizer = TextCleaner.sanitizeString()
    queryString = sanitizer.sub(' ', queryString)
    
    # OPTIONAL:
    # Stem the information
    if _stemmer:
        # Get the regex pattern that sanitizeses information and sanitize it
        # Stem the information
        queryString = FilterInterface.porterStemmer(queryString)

    # CHOOSE HEURISTIC:
    # Search-heuristic used to retrieve the list of results
    if _cosineMeasure:
        results = SearchInterface.sumMeasure(M_lil, M_csc, queryString)
    else:
        results = SearchInterface.sumMeasure(M_lil, M_csc, queryString)

    # Sort the results and reverse to get the highest score first
    results.sort()
    results.reverse()

    resultDic = {}
    for item in results[:top]:
        pmid = item[1]
        label = _labelHash[pmid]
        resultDic[label] = item[0]

    resultList = sorted(resultDic.items(), key=lambda(k, v):(v, k), reverse=True)

    return resultList[:20]

示例#8

0

显示文件

文件： TermDoc.py 项目： hmbachelor/bachelor

def createTermAndPmidHashes():

    """
    This function creates two hash tables of the PMID's and terms to be used
    for the term-doc matrix.

    Note that the terms are sanitized for any non-alphanumerical characters.
    And it is default to remove stop words.
    """

    medlineDir = _medlineDir
    hashTables = _hashTablesDir
    termHashTable={}
    pmidHashTable={}
    termCounter = 0
    pmidCounter = 0

    files = IOmodule.getSortedFilelist(medlineDir+'/')
#    files = sorted([f for f in os.listdir(medlineDir+"/") if os.path.isfile(medlineDir+"/"+f)])

    # Get the regex pattern that sanitizeses strings.
    sanitizer = TextCleaner.sanitizeString()

    for file in files:
        records = RecordHandler.loadMedlineRecords(medlineDir, file)

        # *Note*
        # Parts of the following loops could be optimized by using dictionaries
        # for direct loopkups instead of linear lookups, but since it's not
        # important, optimization will have to wait for another day.

        # Hash PMID's
        for diseaseRecords in records.values():
            for record in diseaseRecords:
                pmid=record[0]
                if pmid not in pmidHashTable:
                    pmidCounter+=1
                    pmidHashTable[pmid]=pmidCounter

                information=''
                # Get the abstract
		try:
			information=' '+record[1]['AB']
		except:
			print 'Unable to get abstract', record[0]
		try:
			information+=' '+record[1]['TI']
		except:
			print 'Unable to get title for', record[0]

		if 'MH' in record[1]:
			for meshterm in record[1]['MH']:
				information+=' '+meshterm
		# We do not want to print this, as most of the
		# records do not have MeSH.
		# print 'Unable to get MeSH terms for', record[0]
		
                # Sanitize the information
                information=sanitizer.sub(' ', information)
                # remove stopwords from the abstract
                information=FilterInterface.stopwordRemover(information)

                # OPTIONAL:
                # Stem the abstract
                if _stemmer: information=FilterInterface.porterStemmer(information)

                termList = [word for word in information.split(' ') if word != '']
                for term in termList:
                    if term not in termHashTable:
                        termCounter+=1
                        termHashTable[term]=termCounter
                    else: continue
                
        print str(termCounter)+" terms hashed. "+str(pmidCounter)+" pmids hashed."

    IOmodule.pickleOut(hashTables, _termHash,"btd", termHashTable)
    IOmodule.pickleOut(hashTables, _pmidHash,"btd", pmidHashTable)

    return termHashTable, pmidHashTable

示例#9

0

显示文件

Created on Sat Oct 27 19:19:55 2018
@author: oxygen0605
"""

import TextCleaner
import Tokenizer
import TextNormalizer
import StopwordRemover

if __name__ == "__main__":
    fileobj = open("source/sample.html", "r", encoding="utf_8")
    text = fileobj.read()
    fileobj.close()

    #形態素解析をしやすくするためのクリーニング
    tcleaner = TextCleaner.TextCleaner()
    text = tcleaner.remove_header(text)
    text = tcleaner.clean_html_and_js_tags(text)
    text = tcleaner.clean_url(text)
    text = tcleaner.clean_code(text)
    text = tcleaner.clean_text(text)
    tcleaner.output(text)

    tokenizer = Tokenizer.JanomeTokenizer()
    words = tokenizer.wakati(text)
    #words = tokenizer.filter_by_pos(text, pos=('名詞'))
    tokenizer.output(words)

    #MeCab
    #tokenizer = Tokenizer.MeCabTokenizer()
    #words = tokenizer.wakati(text)

示例#10

0

显示文件

文件： DiseaseCrawler.py 项目： hmbachelor/bachelor

import urllib2
from BeautifulSoup import *
from urlparse import urljoin
from time import strftime, sleep
import TextCleaner
import IOmodule
import os

# Get compiled regexps
removeTags=TextCleaner.removeHTMLTags()
removeNBSPs=TextCleaner.removeNPSBs()
removeRefs=TextCleaner.removeReferences()
removeSlashes=TextCleaner.removeSlashes()
removeCommas=TextCleaner.removeCommas()
removeWhitespaces=TextCleaner.removeWhitespaces()

# Path to main folder
_mainFolder=os.getenv("HOME")+"/"+"The_Hive"
# Path the phase subfolder
_subFolder =_mainFolder+"/"+"data_acquisition"
# Downloaded diseases
diseaseFolder="rarediseases_info"
# Error log
errorFolder="URL_error_log"

# Create main folder if it doesn't already exist..
if not os.path.isdir(_mainFolder):
        os.mkdir(_mainFolder)

# Create sub folder if it doesn't already exist..
if not os.path.isdir(_subFolder):

示例#11

0

显示文件

文件： PubmedSIR.py 项目： hmbachelor/bachelor

def getArticleIDs(diseaseDictionary):

    """
    Takes a dictionary of the form:
    {'disease xx': {'syn' : [xx, yy], 'term' : string, 'uid' : string,
    'description' : string }, etc}}

    And returns a dictionary containing:
    {'disease a': [pmid1, pmid2, pmid3...], 'disease b' : [pmidx,
    pmidy,...], ...}

    Where disease xx is the name of the disease, syn is a list of
    synonyms, term is a hand crafted search term (if it exists),
    description is the description from the rarediseases.info (if it
    exists).

    Duplicate PMIDs are removed.
    """

    # Iterates through the diseaseDictionary and searches for uid,
    # term, diseasename, and combinations of synonyms
    diseaseArticleIDlist={}
    additionalSearchOptions = ' AND hasabstract[text]' # Contains additional options, e.g. ' AND LA[eng]'

    for disease in diseaseDictionary:

        print
        print 'Processing:', disease
        articleCount=250
        diseaseArticleIDlist[disease] = {}
        diseaseArticleIDlist[disease]['PMIDs']=[]
        diseaseArticleIDlist[disease]['description'] = ''
        if (diseaseDictionary[disease]['terms'] != ''):
            diseaseArticleIDlist[disease]['PMIDs'].extend(getArticleIDsFromMultiSource(diseaseDictionary[disease]['db'],'',TC.unquoteString(diseaseDictionary[disease]['terms']) + additionalSearchOptions,articleCount))
        elif (diseaseDictionary[disease]['uid'] != ''):
            diseaseArticleIDlist[disease]['PMIDs'].extend(getArticleIDsFromMultiSource(diseaseDictionary[disease]['db'],diseaseDictionary[disease]['uid'],'',articleCount))
            
        articleCount = articleCount - len(diseaseArticleIDlist[disease]['PMIDs'])

        # If we still have qouta left
        if (articleCount > 0):
            # Search for the disease name on pubmed/medline
            diseaseArticleIDlist[disease]['PMIDs'].extend(getArticleIDsFromMultiSource('pubmed','',disease + additionalSearchOptions,articleCount))

        # Remove duplicates
        diseaseArticleIDlist[disease]['PMIDs'] = removeDuplicates(diseaseArticleIDlist[disease]['PMIDs'])

        # diseaseArticleIDlist, should contain about 250 PMIDs by now,
        # but we have a max limit on 500 records, therefore we wish to
        # fill up with the other search form now. (By searching on
        # synonyms)
        articleCount = 500 - len(diseaseArticleIDlist[disease]['PMIDs'])

        # Translate the special signs contained in some synonyms
        diseaseDictionary[disease]['syn']=[TC.decodeURLcharacters(TC.unquoteString(i)).encode('utf-8','ignore') for i in diseaseDictionary[disease]['syn']]

        # Create a set of all combinations of synonyms and save it in
        # 'optimizedSynonymList'
        synonymArticleIDlist={}
        optimizedSynonymList = sorted(STC.searchTermCombiner(diseaseDictionary[disease]['syn'], additionalSearchOptions,1))

        # Go though the list of synonyms, download corresponding PMIDs
        # from Pubmed and delete synonyms not returned any PMIDs (and
        # all combinations containing this synonym!).  Note that the
        # sorted nature of the the list of tuples in
        # 'optimizedSynonymList' allows us to delete post-indices
        # while iterating through it without getting indexs errors or
        # missing any steps.
        print "================================================"
        print "Total number of synonyms:",len(optimizedSynonymList)
        printcount=len(optimizedSynonymList)
        for synTuple in optimizedSynonymList:
            synonym=synTuple[1]
            print "Gathering data from: \""+synonym+"\""
            synonymArticleIDlist[synonym]=[]

            # We don't need to get more medline records than can be
            # crammed into the 500 - primary search, so we use the
            # articleCount as paramter. Hopefully there will be lots
            # of small results that get used up first.
            synonymArticleIDlist[synonym].extend(getArticleIDsFromMultiSource('pubmed', '', synonym, articleCount))
            if len(synonymArticleIDlist[synonym])==0:
                tempList=optimizedSynonymList[:]
                for syn in tempList:
                    shortenedSyn=synonym[0:(len(synonym)-len(additionalSearchOptions))]
                    if (synonym != syn[1]) and (shortenedSyn in syn[1]):
                        print "Deleted: "+str(syn)
                        optimizedSynonymList.remove(syn)
                        printcount-=1
                    if (synonym == syn[1]):
                        printcount-=1
                print "-------",printcount,"remaining -------"
            else:
                print "Done with "+str(synTuple)
                printcount-=1
                print "-------",printcount,"remaining -------"
        print "================================================"

        print 'Sorting items accoring to their count by values()'
        # Needs to get the list sorted on the amount of return IDs.
        items = [(len(v),v,k) for k,v in synonymArticleIDlist.items()]
        # Sort according to the length of the lists withing the tuples
        items.sort()
        # Reverse to get largest to smallest in list
        items.reverse()

        listIsEmpty = False

        # Gonna make a list of maximum 250 PMIDs from the synonym
        # list, without duplicates. As set contains unordered, unique
        # elements, we use a set to collect the PMIDs
        collectionSet = set()

        # We might consider changing this, because right now, we check
        # twice to see whether collectionSet exceeds articleCount
        while len(collectionSet) <= articleCount or not listIsEmpty:

            # Is the list of items empty, then we can't do anything.
            if items == []:
                listIfEmpty = True
                break

            transferTuple = items.pop()

            if transferTuple[0] == 0:
                continue

            for pmid in transferTuple[1]:
                # Test to see if len of the set is articleCount?
                if (len(collectionSet) + articleCount) > 499:
                    break
                collectionSet.add(pmid)

        # Print some useful information about the number of returned
        # articles from each of the search types
        print 'Hand crafted / Disease name search returned:', str(len(diseaseArticleIDlist[disease]['PMIDs'])), 'results'
        print 'Synonym search returned:', str(len(collectionSet)), 'results'
        print 'Total number of results:', str(len(collectionSet) + len(diseaseArticleIDlist[disease]['PMIDs']))

        diseaseArticleIDlist[disease]['PMIDs'].extend(list(collectionSet))

        # This might be unusable information, might consider removing it.
        if len(diseaseArticleIDlist[disease]['PMIDs']) == 0:
            print 'The disease:', disease + ' did not return any results.'
        elif len(diseaseArticleIDlist[disease]['PMIDs']) != 500:
            print 'We did not succeed in getting the default number of records (500), however we did get ', len(diseaseArticleIDlist[disease]['PMIDs'])
        
        # Copy over the disease description, if none is present, the
        # value will be empty string ''
        diseaseArticleIDlist[disease]['description'] = diseaseDictionary[disease]['desc']

    return diseaseArticleIDlist

示例#12

0

显示文件

 def parse_description(self):
     if self.pytube_inst:
         description = self.pytube_inst.description
         description = TextCleaner.standardize_text(description)
         self.description = description

示例#13

0

显示文件

 def parse_title(self):
     if self.pytube_inst:
         title = self.pytube_inst.title
         title = TextCleaner.standardize_text(title)
         self.title = title

示例#14

0

显示文件

文件： SearchCases.py 项目： hmbachelor/bachelor

def runScoreTestBlind_diseaseMatrix(lil, csc):

    top = 3000

    """
    ============================================================================
    1) Dreng, normal ved fdslen bortset fra deformitet af begge
    storeter (de manglede et led). Udvikler sig normalt
    efterflgende. Ved 5 rs alderen


    der viser knoglevv uden malignitetstegn. Kort tid efter biopsien
    udvikles mere knoglevkst, prcis der hvor man har skret.
    ----------------------------------------------------------------------------
    System symptom query: Boy, normal birth, deformity of both big
    toes (missing joint), quick development of bone tumor near spine
    and osteogenesis at biopsy.
    ============================================================================
    2) Normally developed boy until age 5, where he progressively
    developed the following symptoms: Talking difficulties, seizures,
    ataxia, adrenal insufficiency and degeneration of visual and
    auditory functions.
    ----------------------------------------------------------------------------
    System symptom query: Normally developed boy age 5, progessive
    development of talking difficulties, seizures, ataxia, adrenal
    insufficiency and degeneration of visual and auditory functions
    ============================================================================
    3) A boy age 14 comes to the doctor with yellow, keratotic plaques
    on the skin of his palms and soles going up onto the dorsal
    side. Both hands and feet are affected.
    
    He equally had swollen and very vulnerable gums since the age of 4
    with loss of most of his permanent teeth.
    ----------------------------------------------------------------------------
    System symptom query: Boy age 14, yellow, keratotic plaques on the
    skin of palms and soles going up onto the dorsal side. Both hands
    and feet are affected.
    ============================================================================
    4) 16-aarig joedisk dreng har en til to gange om maaneden anfald,
    hvor han foerst og fremmest skal sove utroligt meget - ca. 18
    timer om dagen.  Anfaldene varer ca en uges tid. Han aendrer
    karakter under anfaldene og bliver irritabel og aggressiv, naar
    han vaekkes. Naar han er vaagen i anfaldsperioden spiser han helt
    utroligt store maengder mad, og hans appetit paa sex er endvidere
    abnormt stor.
    ----------------------------------------------------------------------------
    System symptom query: Jewish boy age 16, monthly seizures, sleep
    deficiency, aggressive and irritable when woken, highly increased
    sexual appetite and hunger.

    ============================================================================
    5) The patient is a male child presenting at birth with numerous
    malformations. He had midfacial retraction with a deep groove
    under the eyes, and hypertelorism. A short nose with a low nasal
    bridge and large low-set ears were noted. He had a wide mouth and
    retrognathia.  Hypertrichosis with bright reddish hair and a
    median frontal cutaneous angioma were present. The neck was short
    with redundant skin. Bilateral inguinal hernias, hypospadias with
    a megameatus, and cryptorchidism were noted.
    ----------------------------------------------------------------------------
    System symptom query: Male child, malformations at birth,
    midfacial retraction with a deep groove under the eyes, and
    hypertelorism, short nose with a low nasal bridge and large
    low-set ears, wide mouth and retrognathia. Hypertrichosis with
    bright reddish hair and a median frontal cutaneous angioma, short
    neck with redundant skin, Bilateral inguinal hernias, hypospadias
    with a megameatus, and cryptorchidism
    ============================================================================
    """

    #diseaseList=[("Boy, normal birth, deformity of both big toes
    #             (missing joint), quick development of bone tumor
    #             near spine and osteogenesis at biopsy"), ("Normally
    #             developed boy age 5, progessive development of
    #             talking difficulties, seizures, ataxia, adrenal
    #             insufficiency and degeneration of visual and
    #             auditory functions"), ("Boy age 14, yellow keratotic
    #             plaques on the skin of palms and soles going up onto
    #             the dorsal side. Both hands and feet are
    #             affected. swollen vulnerable gums, loss of permanent
    #             teeth.")]

    #diseaseList=[("Jewish boy age 16, monthly seizures, sleep
    #deficiency, aggressive and irritable when woken, highly increased
    #sexual appetite and hunger")]

    #diseaseList=[("Normally developed boy age 5, seizures, ataxia,
    #adrenal insufficiency and degeneration of visual and auditory
    #functions")]

    #diseaseList=[("Male child, malformations at birth, midfacial
    #retraction with a deep groove under the eyes, and hypertelorism,
    #short nose with a low nasal bridge and large low-set ears, wide
    #mouth and retrognathia. Hypertrichosis with bright reddish hair
    #and a median frontal cutaneous angioma, short neck with redundant
    #skin, Bilateral inguinal hernias, hypospadias with a megameatus,
    #and cryptorchidism")]

    #diseaseList=[("Schinzel Giedion syndrome","Male child,
    #malformations at birth, midfacial retraction with a deep groove
    #under the eyes, and hypertelorism, short nose with a low nasal
    #bridge and large low-set ears, wide mouth and
    #retrognathia. Hypertrichosis with bright reddish hair and a
    #median frontal cutaneous angioma, short neck with redundant skin,
    #Bilateral inguinal hernias, hypospadias with a megameatus, and
    #cryptorchidism")]
    
    printout2 = [[], [], []]
    clusterThis = [[], [], []]

    sanitizer = TextCleaner.sanitizeString()
    count = 0
    for disease in diseaseList:

        queryString = sanitizer.sub(' ', disease)

        symptoms = FilterInterface.stopwordRemover(queryString)

        resultLists = searchDisease(lil, csc, symptoms, top)

        printout2[count].append(resultLists)
        clusterThis[count].append(resultLists)
        count += 1

    for list in printout2:
        print list
    print "TEST DONE"

    return clusterThis, printout2

示例#15

0

显示文件

文件： SearchCases.py 项目： hmbachelor/bachelor

def search(M_lil, M_csc, queryString, top=20):

    """
    This function is still a work in progress..
    """
    
    sanitizer = TextCleaner.sanitizeString()
    queryString = sanitizer.sub(' ', queryString)

    # OPTIONAL:
    # Stem the information
    if _stemmer:
        # Get the regex pattern that sanitizeses information and sanitize it
        # Stem the information
        queryString = FilterInterface.porterStemmer(queryString)

    # CHOOSE HEURISTIC:
    # Search-heuristic used to retrieve the list of results
    if _cosineMeasure:
        results = SearchInterface.cosineMeasure(M_lil, M_csc, queryString)
    else:
        results = SearchInterface.sumMeasure(M_lil, M_csc, queryString)

    # Sort the results and reverse to get the highest score first
    results.sort()
    results.reverse()

    # ###########################################################################
    # ### For the term-doc matrix: ##############################################

    # ###########
    # # 1: Mean #
    # ###########

    # # Get the sum cosine score the labels
    # ## (normDic counts the number of times a label has been summed)
    resultDic1 = {}
    normDic1 = {}
    for item in results[:top]:
        pmid = item[1]
        # Get the labels linked to the PMID
        ## (Several labels can be linked to one PMID)
        labels = _labelHash[pmid]
        for label in labels:
            try:
                resultDic1[label] += item[0]
                normDic1[label] += 1
            except:
                resultDic1[label] = item[0]
                normDic1[label] = 1

    # #############
    # # 2: Median #
    # #############

    # # Get the median cosine score of the labels
    # ## (normDic counts the number of times a label has been summed)
    resultDicList2 = {}
    normDic2 = {}
    for item in results[:top]:
        pmid = item[1]
        # Get the labels linked to the PMID
        ## (Several labels can be linked to one PMID)
        labels = _labelHash[pmid]
        for label in labels:
            try:
                resultDicList2[label].append(item[0])
                normDic2[label] += 1
            except:
                resultDicList2[label] = []
                resultDicList2[label].append(item[0])
                normDic2[label] = 1
    resultDic2 = {}
    for label in resultDicList2.keys():
        labelList = resultDicList2[label]
        numOfScores = len(labelList)
        if numOfScores > 2:
            medianIndex = numOfScores / 2
        else:
            medianIndex = 0
        resultDic2[label] = sorted(labelList)[medianIndex]

    # ##########
    # # 3: Max #
    # ##########

    # # Get the max cosine score of labels
    # ## (normDic counts the number of times a label has been summed)
    resultDicList3 = {}
    normDic3 = {}
    for item in results[:top]:
        pmid = item[1]
        # Get the labels linked to the PMID
        ## (Several labels can be linked to one PMID)
        labels = _labelHash[pmid]
        for label in labels:
            try:
                resultDicList3[label].append(item[0])
                normDic3[label] += 1
            except:
                resultDicList3[label] = []
                resultDicList3[label].append(item[0])
                normDic3[label] = 1
    resultDic3 = {}
    for label in resultDicList3.keys():
        labelList = resultDicList3[label]
        resultDic3[label] = max(labelList)

    # # Normalize the summed labels
        #for label in resultDic1.keys():
        #    resultDic1[label]/=normDic1[label]
        #for label in resultDic2.keys():
        #    resultDic2[label]/=normDic2[label]
        #for label in resultDic3.keys():
        #    resultDic3[label]/=normDic3[label]

        ###############################################################################

            ###################################
            ####### return pmid results #######

    # Reverse and sort the concensus list
    resultList_mean = sorted(resultDic1.items(), key=lambda(k, v):(v, k), reverse=True)
    resultList_median = sorted(resultDic2.items(), key=lambda(k, v):(v, k), reverse=True)
    resultList_max = sorted(resultDic3.items(), key=lambda(k, v):(v, k), reverse=True)

    return [resultList_mean, resultList_median, resultList_max]

Python TextCleaner, lightnovel-crawler示例