Exemplo n.º 1
0
    def persist_file_references(self, file_path):
        """
        Parses references from a file and stores them to the database.

        Every reference returned by '_parse_entries_file' is stripped of its
        'reference_id', 'abstract', 'reference_type' and 'url' fields,
        wrapped in an Extraction of its own and persisted through an
        ExtractionGateway.

        file_path -- path of the file to import. It is also used as the
                     result URL for references that carry no 'url' field.

        Returns the list of Extraction objects that were created, in the
        order in which the references were iterated.
        """
        extraction_gw = ExtractionGateway()
        references = self._parse_entries_file(file_path)
        extractions = []

        # enumerate() supplies the 0-based position directly, so there is no
        # need to call len() or to build an auxiliary list of indexes.
        for index, reference in enumerate(references):
            extraction = Extraction()

            # Clean fields that we don't want
            reference.remove_field('reference_id')
            reference.remove_field('abstract')
            reference.remove_field('reference_type')

            # The 'url' field is taken out of the entry as well. When it was
            # present its value becomes the result URL; otherwise the path of
            # the imported file is used instead.
            url_field = reference.remove_field('url')
            if url_field:
                url = url_field.value
            else:
                url = file_path

            extraction.used_result = SearchResult('', url)
            # Label made of the reference position and the file name (the
            # part of the path after the last '/').
            text = unicode('Reference %d from %s' %
                           (index, file_path.rsplit('/', 1)[-1]))
            extraction.file_path = text
            extraction.entries.append(reference)
            extractions.append(extraction)
            extraction_gw.persist_extraction(extraction)
            log.info('Imported ' + text.lower())  #@UndefinedVariable

        return extractions
Exemplo n.º 2
0
 def persist_file_references(self, file_path):
     """
     Parses references from a file and stores them to the database.

     Every reference returned by '_parse_entries_file' is stripped of its
     'reference_id', 'abstract', 'reference_type' and 'url' fields,
     wrapped in an Extraction of its own and persisted through an
     ExtractionGateway.

     file_path -- path of the file to import. It is also used as the
                  result URL for references that carry no 'url' field.

     Returns the list of Extraction objects that were created, in the
     order in which the references were iterated.
     """
     extraction_gw = ExtractionGateway()
     references = self._parse_entries_file(file_path)
     extractions = []

     # enumerate() supplies the 0-based position directly, so there is no
     # need to call len() or to build an auxiliary list of indexes.
     for index, reference in enumerate(references):
         extraction = Extraction()

         # Clean fields that we don't want
         reference.remove_field('reference_id')
         reference.remove_field('abstract')
         reference.remove_field('reference_type')

         # The 'url' field is taken out of the entry as well. When it was
         # present its value becomes the result URL; otherwise the path of
         # the imported file is used instead.
         url_field = reference.remove_field('url')
         if url_field:
             url = url_field.value
         else:
             url = file_path

         extraction.used_result = SearchResult('', url)
         # Label made of the reference position and the file name (the
         # part of the path after the last '/').
         text = unicode('Reference %d from %s' %
                        (index, file_path.rsplit('/', 1)[-1]))
         extraction.file_path = text
         extraction.entries.append(reference)
         extractions.append(extraction)
         extraction_gw.persist_extraction(extraction)
         log.info('Imported ' + text.lower())  #@UndefinedVariable

     return extractions
Exemplo n.º 3
0
 def find_extraction_by_id(self, e_id):
     """
     Loads the stored extraction whose id is 'e_id' and returns it as an
     Extraction object.

     Only the id, the used query and the used result are filled in; the
     used result is a SearchResult built from an empty first argument and
     the stored result value.

     Any exception raised by the query's one() call is propagated.
     NOTE(review): presumably this is a SQLAlchemy session, in which case
     one() raises when there is not exactly one matching row - confirm.
     """
     query = self.session.query(mappers.Extraction)
     stored = query.filter_by(id=e_id).one()

     found = Extraction()
     found.id = stored.id
     found.used_query = stored.query_string
     found.used_result = SearchResult("", stored.result)
     return found
Exemplo n.º 4
0
 def run(self):
     """
     Runs until 'stop_event' is set.

     Takes files from 'in_queue' and hands each one to a new
     'ReferenceMaker' object together with 'target_format'. The object
     returned by 'make_reference' is put on 'out_queue'. If anything
     goes wrong while handling a file, the error is logged and an empty
     'Extraction' is put on 'out_queue' instead.
     """
     # Upper bound (in seconds) for a single wait on the input queue. It
     # also bounds how long the thread needs to notice 'stop_event'.
     poll_timeout = 0.1

     log.debug("Running thread",
               extra={'threadname': self.getName()})  #@UndefinedVariable
     while not self.stop_event.isSet():
         try:
             # Blocking get with a timeout: an idle worker waits inside
             # the queue instead of spinning on empty()/get(False).
             path = self.in_queue.get(True, poll_timeout)
         except Queue.Empty:
             continue
         if not path:
             # Falsy items (None, '') are ignored.
             continue

         log.debug("Processing file %s" % path)  #@UndefinedVariable
         try:
             reference = ReferenceMaker().make_reference(
                 path, self.target_format)
             self.out_queue.put(reference)
         except Exception as e:
             # Keep the worker alive whatever the extraction raised and
             # still hand a (blank) result to the consumer.
             log.error(  #@UndefinedVariable
                 'Unexpected exception while extracting reference'
                 ' for file %s: %s' % (path, str(e)))
             self.out_queue.put(Extraction())
Exemplo n.º 5
0
    def make_reference(self, file, target_format):
        """
        Builds an Extraction for 'file' by chaining three controllers:

        1. RCEController extracts the text of the file and derives query
           strings from it.
        2. IRController obtains the top results for those query strings
           and reports which query was used.
        3. IEController extracts the reference entries from the top
           results and reports which result was used.

        The used query is removed from 'query_strings' and the used result
        is removed from 'top_results'. Every extracted entry is then passed
        to a ReferenceValidator (built with FIELD_WEIGHTS) along with the
        raw text.

        The Extraction is returned as it stands at that point whenever
        the file yields no text, no query strings or no top results.
        """
        result = Extraction()
        result.file_path = file
        result.target_format = target_format

        log.info("Making reference for file: %s" % file) #@UndefinedVariable

        # Step 1: raw content and query strings
        content_ctrl = RCEController(self.factory)
        raw_text = content_ctrl.extract_content(file, FileFormat.TXT)
        if not raw_text:
            return result

        result.query_strings = content_ctrl.get_query_strings(raw_text)
        if not result.query_strings:
            log.error('No query strings extracted') #@UndefinedVariable
            return result
        log.debug("Query strings %s" % str(result.query_strings)) #@UndefinedVariable

        # Step 2: search results for the query strings
        retrieval_ctrl = IRController(self.factory)
        top_results, used_query = retrieval_ctrl.get_top_results(
            result.query_strings)
        result.top_results = top_results
        result.used_query = used_query
        if not result.top_results:
            tried = len(result.query_strings)
            log.error('No top results to use with the available wrappers ' #@UndefinedVariable
                      'after trying %d queries' % tried)
            return result
        # Only the queries that were not used stay in the list.
        result.query_strings.remove(result.used_query)

        log.debug("Used query %s" % str(result.used_query)) #@UndefinedVariable
        log.debug("Query returned %d top results" % len(result.top_results)) #@UndefinedVariable

        # Step 3: reference entries out of the top results
        extraction_ctrl = IEController(self.factory, target_format)
        entries, used_result = extraction_ctrl.extract_reference(
            result.top_results, raw_text)
        result.entries = entries
        result.used_result = used_result
        # Only the results that were not used stay in the list.
        result.top_results.remove(result.used_result)
        log.info("Used result: %s" % str(result.used_result)) #@UndefinedVariable

        # Step 4: validation of every entry against the raw text
        validator = ReferenceValidator(FIELD_WEIGHTS)
        for entry in result.entries:
            validator.validate(entry, raw_text)

        return result