示例#1
0
def _gatherMatrixData(filename):

    """
    Build the per-record term data used to populate the term-doc matrices.

    The records in 'filename' (looked up in the module-level _medlineDir) are
    loaded through the RecordHandler module. For every record the title
    ('TI'), the abstract ('AB') and all MeSH terms ('MH') are glued together
    into one space-separated string, which is then sanitized, stripped of
    english stopwords and, when the module-level _stemmer flag is set, stemmed.

    A missing title or abstract is reported on stdout and skipped; the record
    is still processed with whatever fields it does have.

    It takes the records' file name to gather data from.

    It returns a list holding one _wordCounter(PMID, text) result per record.
    According to the original documentation this is a doc-term list on the
    form: [[PMID,[(term1,count1),...],...]
    """

    medlineDir = _medlineDir

    # Get the regex pattern that sanitizes strings.
    sanitizer = sanitizeString()

    records = RecordHandler.loadMedlineRecords(medlineDir, filename)
    fields = RecordHandler.readMedlineFields(records, ['AB', 'TI', 'MH'])

    l = []
    for pmid, record in fields.items():
        information = ''

        # Get the title if any. Only a missing or unusable field is treated
        # as "not found"; anything else (e.g. Ctrl-C) must not be swallowed.
        try:
            information = ' ' + record['TI']
        except (KeyError, TypeError):
            # Single-argument print: same output on Python 2, valid on Python 3.
            print('Unable to find title in %s' % (pmid,))

        # Get the abstract if any
        try:
            information += ' ' + record['AB']
        except (KeyError, TypeError):
            print('Unable to find abstract in %s' % (pmid,))

        # Get all the mesh terms if any, each one preceded by a space
        if 'MH' in record:
            information += ''.join(' ' + meshterm for meshterm in record['MH'])

        # Sanitize the collected text
        information = sanitizer.sub(' ', information)
        # Remove english stopwords from the information
        information = FilterInterface.stopwordRemover(information)

        # OPTIONAL:
        # Stem the information
        if _stemmer:
            information = FilterInterface.porterStemmer(information)

        l.append(_wordCounter(pmid, information))

    return l
示例#2
0
def testRH():

    """
    Manual check of the RH (record handler) loading functions.

    Loads the records of one hard-coded disease file from a hard-coded
    directory, reads the title ('TI'), MeSH terms ('MH') and abstract ('AB')
    of every record and returns the raw, unprocessed text per record.

    A record lacking any of the three fields is reported on stdout and left
    out of the result.

    It returns a list on the form: [(PMID, text),...]
    """

    path = "/root/The_Hive/data_acquisition/medline_records"
    disease = "Winkelman Bethge Pfeiffer syndrome.txt"

    records = RH.loadMedlineRecords(path, disease)
    fields = RH.readMedlineFields(records, ['TI', 'MH', 'AB'])

    l = []
    for pmid, record in fields.items():
        # Get the title. Only a missing or unusable field counts as
        # "not found"; anything else (e.g. Ctrl-C) must not be swallowed.
        try:
            information = record['TI']
        except (KeyError, TypeError):
            # Single-argument print: same output on Python 2, valid on Python 3.
            print('Unable to find title in %s' % (pmid,))
            continue

        # Get the abstract.
        # NOTE(review): the abstract is appended directly after the title with
        # no separating space, so the last title word and the first abstract
        # word end up fused. Kept as is to leave the returned text unchanged;
        # confirm whether a separator is wanted.
        try:
            information += record['AB']
        except (KeyError, TypeError):
            print('Unable to find abstract in %s' % (pmid,))
            continue

        # Get the MeSH terms, each one preceded by a space
        try:
            information += ''.join(' ' + meshterm for meshterm in record['MH'])
        except (KeyError, TypeError):
            print('Unable to find MeSH in %s' % (pmid,))
            continue

        # The text is deliberately returned raw: no sanitizing, stopword
        # removal or stemming is applied here.
        l.append((pmid, information))

    return l