def finish(self):
    """Complete the import step: fill the progress bar, stop GUI logging
    and tell the user we are done."""
    # Snap the progress bar to its completed state.
    self.progressBar.setMaximum(1)
    self.progressBar.setValue(1)
    log.info('Finished importing. Results can be found in the Manage page') #@UndefinedVariable
    log.removeHandler(self.guihandler) #@UndefinedVariable
    self.empty_edit.setText('Done!')
def finish(self):
    """Signal the end of extraction, stop the worker thread, and detach
    the GUI log handler."""
    log.info('Finished extracting. Results can be found in the Manage ' #@UndefinedVariable
             'page')
    # Stop the thread before jumping to next page
    self.thread.exiting = True
    log.removeHandler(self.guihandler) #@UndefinedVariable
    self.empty_edit.setText('Done!')
def persist_file_references(self, file_path):
    """
    Parses references from a file and stores them to the database.

    Returns the list of Extraction objects that were persisted, one per
    parsed reference.
    """
    extraction_gw = ExtractionGateway()
    references = self._parse_entries_file(file_path)
    extractions = []
    # enumerate() replaces the old zip(references, range(len(references)))
    # pattern; same pairing, single pass.
    for index, reference in enumerate(references):
        extraction = Extraction()
        # Clean fields that we don't want
        reference.remove_field('reference_id')
        reference.remove_field('abstract')
        reference.remove_field('reference_type')
        url = reference.remove_field('url')
        if not url:
            # No URL in the entry: fall back to the source file path.
            url = file_path
        else:
            url = url.value
        extraction.used_result = SearchResult('', url)
        text = unicode('Reference %d from %s' %
                       (index, file_path.rsplit('/', 1)[-1]))
        extraction.file_path = text
        extraction.entries.append(reference)
        extractions.append(extraction)
        extraction_gw.persist_extraction(extraction)
        log.info(''.join(['Imported ', text.lower()])) #@UndefinedVariable
    return extractions
def _do_portal_acm(self, source, page):
    """
    Searches the page for a link to the reference, and then retrieves
    the reference.
    Returns a tuple with the full reference and its format.
    """
    log.info('Using ACM Portal reference wrapper') #@UndefinedVariable
    ref = (None, None)
    anchor = page.find('a', {'onclick': re.compile('popBibTex.cfm')})
    if not anchor:
        return ref
    # The onclick handler looks like window.open('<url>', ...); keep the
    # argument list only. strip('()') replaces strip('\(\)'): '\(' is not
    # a recognized escape, so the old call also stripped literal
    # backslashes.
    jscript = anchor['onclick'].replace('window.open', '').strip('()')
    ref_url = jscript.split(',')[0].strip('\'')
    ref_url = source + '/' + ref_url
    try:
        page = BeautifulSoup(self._browser.get_page(ref_url))
    except BrowserError:
        log.error('Browse error while retrieving entry page') #@UndefinedVariable
        return ref
    pre = page.find('pre')
    if not pre:
        return ref
    # As the wrapper has been hardcoded, we already know what will be the
    # format of the reference
    return (pre.find(text=True).strip(), ReferenceFormat.BIBTEX)
def generate_wrappers(self, url):
    """
    Trains and persists wrappers for every example set available for the
    given URL.
    """
    wrapper_manager = WrapperGateway()
    example_manager = ExampleGateway(
        max_examples=self.max_examples,
        max_examples_from_db=self.max_examples_from_db,
        seconds_between_requests=self.secs_between_reqs)
    example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                url, self.min_validity)
    rulers = []
    # 'set_name' avoids shadowing the 'set' builtin (the loop variable
    # used to be named 'set').
    for set_name in example_sets:
        log.info('Starting wrapper training for set "%s"' % set_name) #@UndefinedVariable
        if set_name == 'author' or set_name == 'editor':
            # Multi-valued person fields need the specialized rulers.
            rulers = [MultiValuePathRuler(),
                      SeparatorsRegexRuler(),
                      ElementsRegexRuler(),
                      PersonRuler()]
        else:
            try:
                value_guide = self.value_guides[set_name]
            except KeyError:
                # No guide configured for this set: accept anything.
                value_guide = '.*'
            rulers = [PathRuler(value_guide), RegexRuler()]
        trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
        try:
            wrappers = trainer.train(example_sets[set_name])
            wrappers = self._prune_wrappers(wrappers)
            wrapper_manager.persist_wrappers(url, set_name, wrappers)
            log.info('Trainer generated %d wrappers' % len(wrappers)) #@UndefinedVariable
        except Exception as e:
            log.error('Error training wrapper for set "%s": %s' %
                      (set_name, e)) #@UndefinedVariable
def initializePage(self):
    """Attach the GUI log handler and launch wrapper training in a
    background thread."""
    log.addHandler(self.guihandler) # @UndefinedVariable
    url = self.field("url").toPyObject()
    log.info("Starting training for URL: %s" % url) # @UndefinedVariable
    self.thread = WrapperTrainingThread(self, url)
    # Both normal completion and termination route to the same cleanup.
    for signal_name in ("finished()", "terminated()"):
        self.connect(self.thread, QtCore.SIGNAL(signal_name), self.finish)
    self.thread.start()
def initializePage(self):
    """Hook the GUI log handler and start the wrapper-training worker."""
    log.addHandler(self.guihandler) #@UndefinedVariable
    url = self.field('url').toPyObject()
    log.info("Starting training for URL: %s" % url) #@UndefinedVariable
    worker = WrapperTrainingThread(self, url)
    self.thread = worker
    # Connect thread signals: success and termination share the cleanup.
    self.connect(worker, QtCore.SIGNAL("finished()"), self.finish)
    self.connect(worker, QtCore.SIGNAL("terminated()"), self.finish)
    worker.start()
def make_reference(self, file, target_format):
    """
    Uses the controllers to extract the content of a file, get some
    query strings, retrieve results from a search engine, and extract
    the reference.

    Returns an Extraction object; intermediate failures leave it
    partially filled and return early.
    """
    # NOTE(review): parameter 'file' shadows the builtin; kept for
    # interface compatibility.
    extraction = Extraction()
    extraction.file_path = file
    extraction.target_format = target_format
    log.info("Making reference for file: %s" % file) #@UndefinedVariable
    rce = RCEController(self.factory)
    # Step 1: raw text content of the file.
    raw_text = rce.extract_content(file, FileFormat.TXT)
    if not raw_text:
        return extraction
    # Step 2: candidate search queries derived from the content.
    extraction.query_strings = rce.get_query_strings(raw_text)
    if not extraction.query_strings:
        log.error('No query strings extracted') #@UndefinedVariable
        return extraction
    log.debug("Query strings %s" % str(extraction.query_strings)) #@UndefinedVariable
    # Step 3: run the queries through the information-retrieval layer.
    ir = IRController(self.factory)
    extraction.top_results, extraction.used_query = (
        ir.get_top_results(extraction.query_strings))
    if not extraction.top_results:
        log.error('No top results to use with the available wrappers ' #@UndefinedVariable
                  'after trying %d queries' % len(extraction.query_strings))
        return extraction
    # Keep only the queries that were not consumed.
    extraction.query_strings.remove(extraction.used_query)
    log.debug("Used query %s" % str(extraction.used_query)) #@UndefinedVariable
    log.debug("Query returned %d top results" % len(extraction.top_results)) #@UndefinedVariable
    # Step 4: extract reference entries from the result pages.
    ie = IEController(self.factory, target_format)
    extraction.entries, extraction.used_result = (
        ie.extract_reference(extraction.top_results, raw_text))
    extraction.top_results.remove(extraction.used_result)
    log.info("Used result: %s" % str(extraction.used_result)) #@UndefinedVariable
    # Step 5: score each extracted entry against the original text.
    validator = ReferenceValidator(FIELD_WEIGHTS)
    for entry in extraction.entries:
        validator.validate(entry, raw_text)
    return extraction
def _do_citeseerx(self, source, page):
    """
    Searches the page for a link to the reference, and then retrieves
    the reference.
    Returns a tuple with the full reference and its format, or
    (None, None) if the reference cannot be located.
    """
    log.info('Using CiteSeerX reference wrapper') #@UndefinedVariable
    ref = (None, None)
    try:
        # The BibTeX entry lives in a 'content' div whose text starts
        # with '@<type>{'.
        ref_element = page.find('div', {'class': 'content'},
                                text=re.compile('@\w*{'))
        ref_element = ref_element.parent.findAll(text=True)
        reference = ''.join(ref_element)
    except Exception as e:
        log.warn('Could not find reference in citeseerx page: %s' % e) #@UndefinedVariable
        return ref
    # Bug fix: the function used to fall through here and implicitly
    # return None, discarding the reference it had just extracted.
    # CiteSeerX serves BibTeX, matching the hardcoded sibling wrappers.
    return (reference, ReferenceFormat.BIBTEX)
def run(self): log.debug("Start running index maker") #@UndefinedVariable # Run threads self.thread_runner = ThreadRunner(self.trhead_class, self._in_queue, self._out_queue) self.thread_runner.start() while not (self.thread_runner.finished and self._out_queue.empty()): extraction = self._out_queue.get() log.info('Persisting extraction results') #@UndefinedVariable # Persist the extraction ExtractionGateway().persist_extraction(extraction) self.processed.append(extraction) # Commit changes to the database flush_changes() log.debug("Total processed: %d" % len(self.processed)) #@UndefinedVariable
def _use_reference_wrappers(self, source, page, raw_text):
    """
    Use a reference wrapper to get the reference from a given page.
    Returns a list of References with the full entry, format and a
    structure with the different fields.
    A single publication may need more than a reference (e.g:
    inproceedings and its proceedings)
    """
    log.info('Attempting to extract reference with a reference wrapper') #@UndefinedVariable
    references = []
    entry, format = ReferenceWrapper().extract_info(source, page)
    if not entry:
        log.debug('Could not find any entry using a reference wrapper') #@UndefinedVariable
        return references
    # Create a parser for the given reference format
    try:
        parser = self.util_factory.create_parser(format)
    except UtilCreationError as e:
        log.error('Could not create a parser for %s: %s' % (format, #@UndefinedVariable
                                                            e.args))
        return references
    if not parser.check_format(entry):
        log.error('Given entry is not in %s' % format) #@UndefinedVariable
        return references
    # There may be more than one entry for the same file.
    log.debug('Parsing extracted entries') #@UndefinedVariable
    try:
        entries = parser.split_source(entry)
        for entry in entries:
            fields = parser.parse_entry(entry)
            reference = Reference(fields, format, entry)
            self._validate_reference_fields(reference, raw_text)
            references.append(reference)
    except Exception as e:
        log.error('Error parsing extracted entry: %s ' % e) #@UndefinedVariable
    # Bug fix: the parsed references used to be dropped because the
    # function ended without returning them (implicitly returning None).
    return references
def get_top_results(self, query_strings, engine=ENGINE):
    """
    Searches the given query strings with the requested engine.
    Returns a tuple (results, query): the list of search results and the
    query that produced them. results is empty when no query yielded
    usable results.
    """
    results = []
    query = None  # guard: stays defined when query_strings is empty
    # Get a searcher
    try:
        searcher = self.util_factory.create_searcher(engine)
    except UtilCreationError as e:
        log.error('Could not create a searcher: %s' % e.args) #@UndefinedVariable
        # Bug fix: callers unpack a two-element tuple, but this path used
        # to return a bare list.
        return (results, None)
    # Search the query strings
    for query in query_strings:
        searcher.set_query(query)
        try:
            log.debug('Searching query %s' % (query)) #@UndefinedVariable
            results = searcher.get_results()
        except SearchError as e:
            log.error(e.error) #@UndefinedVariable
            break
        if searcher.num_results >= self.too_many_results:
            # Too many hits means the query was not specific enough.
            log.debug('Search with query %s yielded too many results ' #@UndefinedVariable
                      '(%d or more)' % (query, self.too_many_results))
            results = []
            continue
        if results:
            log.info('Searcher yielded the following results using ' #@UndefinedVariable
                     'query %s' % (query))
            for result in results:
                log.info(' %s' % result.url[:120]) #@UndefinedVariable
            # Sorting may filter results, so re-check before breaking.
            results = self._sort_results(results)
        if results:
            break
    # Bug fix: the function used to fall off the end and implicitly
    # return None, which broke the tuple-unpacking callers.
    return (results, query)
def extract_reference(self, top_results, raw_text):
    """
    Returns a tuple (references, result): the list of References that
    could be extracted (empty otherwise) and the search result that was
    used (None when top_results is empty).
    A single publication may need more than a reference (e.g:
    inproceedings and its proceedings)
    """
    log.info('Using %d top results' % len(top_results)) #@UndefinedVariable
    page = None
    references = []
    # Bug fix: initialize so the final return does not raise NameError
    # when top_results is empty.
    result = None
    for result in top_results:
        try:
            log.debug('Retrieving page for result %s' % result.url) #@UndefinedVariable
            page = self.browser.get_page(result.url)
        except BrowserError as e:
            log.error('Error retrieving page %s: %s' % (result.url, #@UndefinedVariable
                                                        e.error))
            continue
        page = ContentCleaner().clean_content(page)
        # Prefer reference wrappers; fall back to ruled wrappers.
        references = self._use_reference_wrappers(result.base_url, page,
                                                  raw_text)
        if not references:
            references = self._use_rule_wrappers(result.base_url, page,
                                                 raw_text)
        if references:
            break
    # Convert to target format, if necessary
    for reference in references:
        self._format_reference(reference)
    # Return the extracted reference and the result that has been used
    return (references, result)
def generate_wrappers(self, url):
    """
    Trains and persists wrappers for every example set available for the
    given URL.
    """
    wrapper_manager = WrapperGateway()
    example_manager = ExampleGateway(
        max_examples=self.max_examples,
        max_examples_from_db=self.max_examples_from_db,
        seconds_between_requests=self.secs_between_reqs)
    example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                url, self.min_validity)
    rulers = []
    # 'set_name' avoids shadowing the 'set' builtin (the loop variable
    # used to be named 'set').
    for set_name in example_sets:
        log.info('Starting wrapper training for set "%s"' % set_name) #@UndefinedVariable
        if set_name == 'author' or set_name == 'editor':
            # Multi-valued person fields need the specialized rulers.
            rulers = [MultiValuePathRuler(),
                      SeparatorsRegexRuler(),
                      ElementsRegexRuler(),
                      PersonRuler()]
        else:
            try:
                value_guide = self.value_guides[set_name]
            except KeyError:
                # No guide configured for this set: accept anything.
                value_guide = '.*'
            rulers = [PathRuler(value_guide), RegexRuler()]
        trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
        try:
            wrappers = trainer.train(example_sets[set_name])
            wrappers = self._prune_wrappers(wrappers)
            wrapper_manager.persist_wrappers(url, set_name, wrappers)
            log.info('Trainer generated %d wrappers' % len(wrappers)) #@UndefinedVariable
        except Exception as e:
            log.error('Error training wrapper for set "%s": %s' %
                      (set_name, e)) #@UndefinedVariable
def _use_reference_wrappers(self, source, page, raw_text):
    """
    Use a reference wrapper to get the reference from a given page.
    Returns a list of References with the full entry, format and a
    structure with the different fields.
    A single publication may need more than a reference (e.g:
    inproceedings and its proceedings)
    """
    log.info('Attempting to extract reference with a reference wrapper') #@UndefinedVariable
    references = []
    entry, format = ReferenceWrapper().extract_info(source, page)
    if not entry:
        log.debug('Could not find any entry using a reference wrapper') #@UndefinedVariable
        return references
    # Create a parser for the given reference format
    try:
        parser = self.util_factory.create_parser(format)
    except UtilCreationError as e:
        log.error('Could not create a parser for %s: %s' % (format, #@UndefinedVariable
                                                            e.args))
        return references
    if not parser.check_format(entry):
        log.error('Given entry is not in %s' % format) #@UndefinedVariable
        return references
    # There may be more than one entry for the same file.
    log.debug('Parsing extracted entries') #@UndefinedVariable
    try:
        entries = parser.split_source(entry)
        for entry in entries:
            fields = parser.parse_entry(entry)
            reference = Reference(fields, format, entry)
            self._validate_reference_fields(reference, raw_text)
            references.append(reference)
    except Exception as e:
        log.error('Error parsing extracted entry: %s ' % e) #@UndefinedVariable
    # Bug fix: the parsed references used to be dropped because the
    # function ended without returning them (implicitly returning None).
    return references
def get_top_results(self, query_strings, engine=ENGINE):
    """
    Searches the given query strings with the requested engine.
    Returns a tuple (results, query): the list of search results and the
    query that produced them. results is empty when no query yielded
    usable results.
    """
    results = []
    query = None  # guard: stays defined when query_strings is empty
    # Get a searcher
    try:
        searcher = self.util_factory.create_searcher(engine)
    except UtilCreationError as e:
        log.error('Could not create a searcher: %s' % e.args) #@UndefinedVariable
        # Bug fix: callers unpack a two-element tuple, but this path used
        # to return a bare list.
        return (results, None)
    # Search the query strings
    for query in query_strings:
        searcher.set_query(query)
        try:
            log.debug('Searching query %s' % (query)) #@UndefinedVariable
            results = searcher.get_results()
        except SearchError as e:
            log.error(e.error) #@UndefinedVariable
            break
        if searcher.num_results >= self.too_many_results:
            # Too many hits means the query was not specific enough.
            log.debug('Search with query %s yielded too many results ' #@UndefinedVariable
                      '(%d or more)' % (query, self.too_many_results))
            results = []
            continue
        if results:
            log.info('Searcher yielded the following results using ' #@UndefinedVariable
                     'query %s' % (query))
            for result in results:
                log.info(' %s' % result.url[:120]) #@UndefinedVariable
            # Sorting may filter results, so re-check before breaking.
            results = self._sort_results(results)
        if results:
            break
    # Bug fix: the function used to fall off the end and implicitly
    # return None, which broke the tuple-unpacking callers.
    return (results, query)
def extract_reference(self, top_results, raw_text):
    """
    Returns a tuple (references, result): the list of References that
    could be extracted (empty otherwise) and the search result that was
    used (None when top_results is empty).
    A single publication may need more than a reference (e.g:
    inproceedings and its proceedings)
    """
    log.info('Using %d top results' % len(top_results)) #@UndefinedVariable
    page = None
    references = []
    # Bug fix: initialize so the final return does not raise NameError
    # when top_results is empty.
    result = None
    for result in top_results:
        try:
            log.debug('Retrieving page for result %s' % result.url) #@UndefinedVariable
            page = self.browser.get_page(result.url)
        except BrowserError as e:
            log.error('Error retrieving page %s: %s' % (result.url, #@UndefinedVariable
                                                        e.error))
            continue
        page = ContentCleaner().clean_content(page)
        # Prefer reference wrappers; fall back to ruled wrappers.
        references = self._use_reference_wrappers(result.base_url, page,
                                                  raw_text)
        if not references:
            references = self._use_rule_wrappers(result.base_url, page,
                                                 raw_text)
        if references:
            break
    # Convert to target format, if necessary
    for reference in references:
        self._format_reference(reference)
    # Return the extracted reference and the result that has been used
    return (references, result)
def _use_rule_wrappers(self, source, page, raw_text):
    """
    Look if there is any wrapper in the database for the given source
    and use it to extract reference fields from the page.
    Returns a one-element list with the extracted Reference, or an empty
    list when nothing could be extracted.
    """
    log.info('Attempting to extract reference with ruled wrappers') #@UndefinedVariable
    reference = Reference()
    wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
    wrapper_field_collections = wrapper_manager.find_wrapper_collections(source)
    for collection in wrapper_field_collections:
        # Get the wrappers for the current collection
        url, field = collection.url, collection.field
        wrappers = wrapper_manager.get_wrappers(url, field)
        log.debug('Collection %s:%s has %d wrappers' % (url, field, #@UndefinedVariable
                                                        len(wrappers)))
        # Get field validator
        try:
            validator = self.field_validation[collection.field][1]
        except KeyError:
            validator = None
        # Extract information using the wrappers we have
        for wrapper in wrappers:
            info = wrapper.extract_info(page)
            # We expect 'info' to be a string; lists are only acceptable
            # for the multi-valued person fields. isinstance replaces the
            # old type(info) == list check so list subclasses count too.
            if isinstance(info, list) and not (collection.field == 'author'
                                               or collection.field == 'editor'):
                continue
            log.debug('Info extracted by wrapper: %s' % info) #@UndefinedVariable
            valid = validator.validate(info, raw_text) if validator else True
            # Save the extracted info even if it's not correct. It will
            # be overwritten afterwards if necessary
            reference.set_field(field, info, valid)
            if not valid:
                log.debug('The extracted information is not valid. ' #@UndefinedVariable
                          'Downvoting wrapper.')
                wrapper.downvotes += 1
                wrapper_manager.update_wrapper(wrapper)
            else:
                log.debug('The extracted information is valid. ' #@UndefinedVariable
                          'Upvoting wrapper')
                wrapper.upvotes += 1
                wrapper_manager.update_wrapper(wrapper)
                break
    if len(reference.fields) > 0:
        log.info('Extracted reference') #@UndefinedVariable
        return [reference]
    else:
        log.info('Could not extract reference using ruled wrappers') #@UndefinedVariable
        return []
def _check_still_valid(self, mapper, content, min_validity):
    """
    It checks if the information to be extracted is really present
    within the contents. If it doesn't, then it updates the database so
    the corresponding records won't be used again.
    """
    # Nothing to check against: leave the database untouched.
    if not content:
        return False
    # Count the fields whose value can no longer be located. Only
    # presence matters here, not location.
    missing = 0.0
    for field in mapper.fields:
        if not content.find(text=re.compile(re.escape(field.value))):
            log.info('Field %s with value %s cannot be found anymore in %d' #@UndefinedVariable
                     % (field.name, field.value, mapper.id))
            field.valid = False
            missing += 1
    # Validity is the fraction of fields still present (1 when the
    # mapper has no fields at all).
    total = len(mapper.fields)
    validity = 1 - (missing / total) if total else 1
    if validity < min_validity:
        log.info('Reference "%d" marked as invalid from now on.' % #@UndefinedVariable
                 mapper.id)
        mapper.validity = validity
        return False
    return True
def finish(self):
    """Wrap up the import: complete the progress bar and stop routing
    log records to the GUI."""
    bar = self.progressBar
    bar.setMaximum(1)
    bar.setValue(1)
    log.info('Finished importing. Results can be found in the Manage page') #@UndefinedVariable
    log.removeHandler(self.guihandler) #@UndefinedVariable
    self.empty_edit.setText('Done!')
def import_references(self, path):
    """Import every reference found in *path* and return how many were
    persisted."""
    log.info('Importing references from %s' % path) #@UndefinedVariable
    imported = self.ref_controller.persist_file_references(path)
    return len(imported)
self.parser = self.util_factory.create_parser(self.format) except UtilCreationError, e: log.error('Error creating parser for format %s: %s' % #@UndefinedVariable (str(self.format), str(e))) return references try: file = open(file_path, 'r') content = file.read() except Exception, e: log.error('Error reading entries file %s: %s' % #@UndefinedVariable (file_path, str(e))) return references if not content: log.info('Empty entries file') #@UndefinedVariable return references if not self.parser.check_format(content): log.error('Given entry is not in %s' % format) #@UndefinedVariable return references # There may be more than one entry for the same file. log.debug('Parsing entries') #@UndefinedVariable entries = self.parser.split_source(content) for entry in entries: fields = self.parser.parse_entry(entry) reference = Reference(fields, format, entry) reference.validity = 1.0
def _use_rule_wrappers(self, source, page, raw_text): """ Look if there is any wrapper in the database for the given source. """ log.info('Attempting to extract reference with ruled wrappers' ) #@UndefinedVariable fields = {} reference = Reference() wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers) wrapper_field_collections = wrapper_manager.find_wrapper_collections( source) for collection in wrapper_field_collections: # Get the wrappers for the current collection url, field = collection.url, collection.field wrappers = wrapper_manager.get_wrappers(url, field) log.debug('Collection %s:%s has %d wrappers' % ( url, field, #@UndefinedVariable len(wrappers))) # Get field validator try: validator = self.field_validation[collection.field][1] except KeyError: validator = None # Extract information using the wrappers we have for wrapper in wrappers: info = wrapper.extract_info(page) # we expect 'info' to be a string if type(info) == list and not (collection.field == 'author' or collection.field == 'editor'): continue log.debug('Info extracted by wrapper: %s' % info) #@UndefinedVariable valid = validator.validate(info, raw_text) if validator else True # Save the extracted info even if it's not correct. It will # be overwritten afterwards if necessary reference.set_field(field, info, valid) if not valid: log.debug( 'The extracted information is not valid. ' #@UndefinedVariable 'Downvoting wrapper.') wrapper.downvotes += 1 wrapper_manager.update_wrapper(wrapper) else: log.debug( 'The extracted information is valid. ' #@UndefinedVariable 'Upvoting wrapper') wrapper.upvotes += 1 wrapper_manager.update_wrapper(wrapper) fields[field] = info break if len(reference.fields) > 0: log.info('Extracted reference') #@UndefinedVariable return [reference] else: log.info('Could not extract reference using ruled wrappers' ) #@UndefinedVariable return []
log.error('Error creating parser for format %s: %s' % #@UndefinedVariable (str(self.format), str(e))) return references try: file = open(file_path, 'r') content = file.read() except Exception, e: log.error( 'Error reading entries file %s: %s' % #@UndefinedVariable (file_path, str(e))) return references if not content: log.info('Empty entries file') #@UndefinedVariable return references if not self.parser.check_format(content): log.error('Given entry is not in %s' % format) #@UndefinedVariable return references # There may be more than one entry for the same file. log.debug('Parsing entries') #@UndefinedVariable entries = self.parser.split_source(content) for entry in entries: fields = self.parser.parse_entry(entry) reference = Reference(fields, format, entry) reference.validity = 1.0 references.append(reference)