class TestCase(): def __init__(self): self.extractor = Extractor().getInstance() def test_extractFromPage(self): '''Test method extract_from_source.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() # prepare f = file("../test/example_google_page.txt", "r") html = f.read() f.close() # test models = self.extractor.extract_from_source(html) print "**:>> %s" % len(models) for model in models: print model.asDetailText(); print '-END TEST-'
class TestCase(): def __init__(self): self.extractor = Extractor().getInstance() def test_extractFromPage(self): '''Test method extract_from_source.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() # prepare f = file("../test/example_google_page.txt", "r") html = f.read() f.close() # test models = self.extractor.extract_from_source(html) for model in models: print model print '-END TEST-' def test_getNodesByPersonName(self): '''Test method getNodesByPersonName.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() e = Extractor() models = e.getNodesByPersonName('jie tang') for model in models: print model print '-END TEST-' def test_clean_title(self): html = '''<p><div class=gs_r><h3><a href="http://doi.ieeecomputersociety.org/10.110910.1109/ICDM.2001.989541" onmousedown="return clk(this.href,'','res','16')">CMAR: Accurate and efficient classification based on multiple class-association …</a></h3><span class="gs_ggs gs_fl"><b><a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.9014&rep=rep1&type=pdf" onmousedown="return clk(this.href,'gga','gga','16')">psu.edu</a> <span class=gs_ctg>[PDF]</span></b></span><font size=-1><br><span class=gs_a>WLJ Han, J Pei - Proc. of IEEE-ICDM, 2001 - doi.ieeecomputersociety.org</span><br>Previous studies propose that associative classification has high classification accuracy and <br> strong flexibility at handling unstructured data. However, it still suffers from the huge set of mined <br> rules and sometimes biased classi- fication or overfitting since the classification is based <b> ...</b> <br><span class=gs_fl><a href="/scholar?cites=1090097156101892771&hl=en&num=100&as_sdt=2000">Cited by 511</a> - <a href="/scholar?q=related:o-odgJbNIA8J:scholar.google.com/&hl=en&num=100&as_sdt=2000">Related articles</a> - <a href="/scholar?cluster=1090097156101892771&hl=en&num=100&as_sdt=2000">All 28 versions</a></span></font> </div> <p><div class=gs_r><h3><a href="http://portal.acm.org/citation.cfm?id=347167" onmousedown="return''' models = self.extractor.extract_from_source(html) for model in models: print model print '- test done -' def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Vollstandige Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -' def test_pin_query(self): '''Test pin query''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() #---------------------------------------------------- pub_candidates = [] pub_candidates.append( Publication( -1, 2000, 'Language, Cohesion and Form Margaret Masterman (1910-1986) (Edited by Yorick Wilks, University of Sheffield), Cambridge University Press (Studies in natural language processing, edited by Steven Bird and Branimir Boguraev), 2005, x+312 pp; hardbound, ISBN 0-521-45489-1', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Methodology and technology for virtual component driven hardware/software co-design on the system-level', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'From the Editor: Security Cosmology: Moving from Big Bang to Worlds in Collusion', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'XML for the Exchange of Automation Project Information', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication(-1, 2000, 'Editor\'s Notes', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Integrating Mathematical and Symbolic Models Through AESOP: An Expert for Stock Options Pricing', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Von Transaktionen zu Problemlosungszyklen: Erweiterte Verarbeitungsmodelle fur Non-Standard-Datenbanksysteme', "pubkey", -1, "authors", -5)) pub_candidates.append( Publication( -1, 2000, 'Schemazusammenfuhrungen mit Vorgaben: Eine Studie uber die STEP-Norm AP214 und Oracle\'s Flexfelder', "pubkey", -1, "authors", -5)) query, pubs = self.extractor.pinMaxQuery(pub_candidates) print query for pub in pubs: print pub
class TestPubMatcher: def __init__(self): self.matcher = PubMatcher() # # Test # def test_matchPub(self): self.extractor = Extractor().getInstance() pubdao = PublicationDao() person_id = 13419 person_name = 'jie tang' # Read sources from files all_models = {} for page in range(0, 3): filename = "".join((person_name, '_page_', str(page), '.html')) f = file(os.path.join(self.settings.source_dir, filename), 'r') html = f.read() models = self.extractor.extract_from_source(html) if models is not None: self.extractor._Extractor__merge_into_extractedmap( all_models, models) print 'Total found DEBUG %s items.' % len(all_models) # part 2 pubs = pubdao.getPublicationByPerson(person_id, self.settings.generation) printout = False if printout: for key, models in all_models.items(): print key, " --> ", models print '===================' for pub in pubs: print pub (pubs_matched, pubs_not_matched) = self.matchPub(pubs, all_models) print '- test done -', len(pubs_matched), len(pubs_not_matched) return pubs_not_matched def test_fetchByPubs(self, pubs): '''Test use a list of pubs that not found in person search''' print '-- test fetchByPubs %s pubs', len(pubs) new_pubs = [] for pub in pubs: new_pubs.append((pub, 'jie tang')) extractor = Extractor() extractor.getNodesByPubs(new_pubs) print '- test done -' def test_match_with_authors(self): data_test = (( '… DeSmedt, W Du, W <b>Kent</b>, MA Ketabchi, WA … - …, 1991 - doi.ieeecomputersociety.org', 'Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan' ), ( 'R Ahmed, P DeSmedt, W Du, W Kent, MA … - …, 1991 - doi.ieeecomputersociety.org', 'Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan' ), ( 'P Lyngbaek, W Kent - … on the 1986 international workshop on Object …, 1986 - portal.acm.org', 'Peter Lyngbak,William Kent' ), ( 'W Kent - Proceedings of the 8th Bristish National …, 1990 - fog.hpl.external.hp.com', 'William Kent' ), ( 'DE Neiman, DW Hildum, VR Lessef, T …', 'Daniel E. Neiman,David W. Hildum,Victor R. Lesser,Tuomas Sandholm' ), ( 'M Esmaili, R Safavi-Naini, J Pieprzyk', 'Mansour Esmaili,Reihaneh Safavi-Naini,Josef Pieprzyk' ), ('DH Fishman, J Annevelink, E Chow, T …', 'Daniel H. Fishman,Jurgen Annevelink,David Beech,E. C. Chow,Tim Connors,J. W. Davis,Waqar Hasan,C. G. Hoch,William Kent,S. Leichner,Peter Lyngbak,Brom Mahbod,Marie-Anne Neimat,Tore Risch,Ming-Chien Shan,W. Kevin Wilkinson' )) data_debug = (( 'DH Fishman, J Annevelink, E Chow, T …', 'Daniel H. Fishman,Jurgen Annevelink,David Beech,E. C. Chow,Tim Connors,J. W. Davis,Waqar Hasan,C. G. Hoch,William Kent,S. Leichner,Peter Lyngbak,Brom Mahbod,Marie-Anne Neimat,Tore Risch,Ming-Chien Shan,W. Kevin Wilkinson' ), ) for ga, da in data_debug: print "match: %s \n with: %s \n is: %s" % (ga, da, \ self.matcher.matchAuthors(ga, da, debug_output=True))
class TestPubMatcher: def __init__(self): self.matcher = PubMatcher() # # Test # def test_matchPub(self): self.extractor = Extractor().getInstance() pubdao = PublicationDao() person_id = 13419 person_name = 'jie tang' # Read sources from files all_models = {} for page in range(0, 3): filename = "".join((person_name, '_page_', str(page), '.html')) f = file(os.path.join(self.settings.source_dir, filename), 'r') html = f.read() models = self.extractor.extract_from_source(html) if models is not None: self.extractor._Extractor__merge_into_extractedmap(all_models, models) print 'Total found DEBUG %s items.' % len(all_models) # part 2 pubs = pubdao.getPublicationByPerson(person_id, self.settings.generation) printout = False if printout: for key, models in all_models.items(): print key, " --> ", models print '===================' for pub in pubs: print pub (pubs_matched, pubs_not_matched) = self.matchPub(pubs, all_models) print '- test done -', len(pubs_matched), len(pubs_not_matched) return pubs_not_matched def test_fetchByPubs(self, pubs): '''Test use a list of pubs that not found in person search''' print '-- test fetchByPubs %s pubs', len(pubs) new_pubs = [] for pub in pubs: new_pubs.append((pub, 'jie tang')) extractor = Extractor() extractor.getNodesByPubs(new_pubs) print '- test done -' def test_match_with_authors(self): data_test = ( ('… DeSmedt, W Du, W <b>Kent</b>, MA Ketabchi, WA … - …, 1991 - doi.ieeecomputersociety.org', 'Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan'), ('R Ahmed, P DeSmedt, W Du, W Kent, MA … - …, 1991 - doi.ieeecomputersociety.org', 'Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan'), ('P Lyngbaek, W Kent - … on the 1986 international workshop on Object …, 1986 - portal.acm.org', 'Peter Lyngbak,William Kent'), ('W Kent - Proceedings of the 8th Bristish National …, 1990 - fog.hpl.external.hp.com', 'William Kent'), ('DE Neiman, DW Hildum, VR Lessef, T …', 'Daniel E. Neiman,David W. Hildum,Victor R. Lesser,Tuomas Sandholm'), ('M Esmaili, R Safavi-Naini, J Pieprzyk', 'Mansour Esmaili,Reihaneh Safavi-Naini,Josef Pieprzyk'), ('DH Fishman, J Annevelink, E Chow, T …', 'Daniel H. Fishman,Jurgen Annevelink,David Beech,E. C. Chow,Tim Connors,J. W. Davis,Waqar Hasan,C. G. Hoch,William Kent,S. Leichner,Peter Lyngbak,Brom Mahbod,Marie-Anne Neimat,Tore Risch,Ming-Chien Shan,W. Kevin Wilkinson') ) data_debug = ( ('DH Fishman, J Annevelink, E Chow, T …', 'Daniel H. Fishman,Jurgen Annevelink,David Beech,E. C. Chow,Tim Connors,J. W. Davis,Waqar Hasan,C. G. Hoch,William Kent,S. Leichner,Peter Lyngbak,Brom Mahbod,Marie-Anne Neimat,Tore Risch,Ming-Chien Shan,W. Kevin Wilkinson'), ) for ga, da in data_debug: print "match: %s \n with: %s \n is: %s" % (ga, da, \ self.matcher.matchAuthors(ga, da, debug_output=True))
class TestCase(): def __init__(self): self.extractor = Extractor().getInstance() def test_extractFromPage(self): '''Test method extract_from_source.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() # prepare f = file("../test/example_google_page.txt", "r") html = f.read() f.close() # test models = self.extractor.extract_from_source(html) for model in models: print model print '-END TEST-' def test_getNodesByPersonName(self): '''Test method getNodesByPersonName.''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() e = Extractor() models = e.getNodesByPersonName('jie tang') for model in models: print model print '-END TEST-' def test_clean_title(self): html = '''<p><div class=gs_r><h3><a href="http://doi.ieeecomputersociety.org/10.110910.1109/ICDM.2001.989541" onmousedown="return clk(this.href,'','res','16')">CMAR: Accurate and efficient classification based on multiple class-association …</a></h3><span class="gs_ggs gs_fl"><b><a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.9014&rep=rep1&type=pdf" onmousedown="return clk(this.href,'gga','gga','16')">psu.edu</a> <span class=gs_ctg>[PDF]</span></b></span><font size=-1><br><span class=gs_a>WLJ Han, J Pei - Proc. of IEEE-ICDM, 2001 - doi.ieeecomputersociety.org</span><br>Previous studies propose that associative classification has high classification accuracy and <br> strong flexibility at handling unstructured data. However, it still suffers from the huge set of mined <br> rules and sometimes biased classi- fication or overfitting since the classification is based <b> ...</b> <br><span class=gs_fl><a href="/scholar?cites=1090097156101892771&hl=en&num=100&as_sdt=2000">Cited by 511</a> - <a href="/scholar?q=related:o-odgJbNIA8J:scholar.google.com/&hl=en&num=100&as_sdt=2000">Related articles</a> - <a href="/scholar?cluster=1090097156101892771&hl=en&num=100&as_sdt=2000">All 28 versions</a></span></font> </div> <p><div class=gs_r><h3><a href="http://portal.acm.org/citation.cfm?id=347167" onmousedown="return''' models = self.extractor.extract_from_source(html) for model in models: print model print '- test done -' def test_debug_not_found(self): '''Debug Errors''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'Formalizzazione e Ottimizzazione di Transazioni di modifica in CLP(AD)', "pubkey", -1, "authors", -5)) #---------------------------------------------------- pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'On the Use of Spreading Activation Methods in Automatic Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Chairman\'s Message', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Introduction to Modern Information Retrieval', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Publications', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Die RISC-CISC Debatte', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Kryptologie', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Integritat in IT-Systemen', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Vollstandige Reduktionssysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Approximative Public-Key-Kryptosysteme', "pubkey", -1, "authors", -5)) matcher = PubMatcher.getInstance() extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) all_models = extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -' def test_pin_query(self): '''Test pin query''' print '-TEST-:', self.test_extractFromPage.__doc__.strip() #---------------------------------------------------- pub_candidates = [] pub_candidates.append(Publication(-1, 2000, 'Language, Cohesion and Form Margaret Masterman (1910-1986) (Edited by Yorick Wilks, University of Sheffield), Cambridge University Press (Studies in natural language processing, edited by Steven Bird and Branimir Boguraev), 2005, x+312 pp; hardbound, ISBN 0-521-45489-1', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Methodology and technology for virtual component driven hardware/software co-design on the system-level', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'From the Editor: Security Cosmology: Moving from Big Bang to Worlds in Collusion', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'XML for the Exchange of Automation Project Information', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Editor\'s Notes', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Integrating Mathematical and Symbolic Models Through AESOP: An Expert for Stock Options Pricing', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Von Transaktionen zu Problemlosungszyklen: Erweiterte Verarbeitungsmodelle fur Non-Standard-Datenbanksysteme', "pubkey", -1, "authors", -5)) pub_candidates.append(Publication(-1, 2000, 'Schemazusammenfuhrungen mit Vorgaben: Eine Studie uber die STEP-Norm AP214 und Oracle\'s Flexfelder', "pubkey", -1, "authors", -5)) query, pubs = self.extractor.pinMaxQuery(pub_candidates) print query for pub in pubs: print pub