def read_bibcode_file(self, bibcode_file_path):
     """ Function that read the list of bibcodes in one file:
         The bibcodes must be at the beginning of a row.
     """
     printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
     printmsg(self.verbose, "Reading %s \n" % bibcode_file_path)
     try:
         bibfile = open(bibcode_file_path, "rU")
     except IOError:
         sys.stdout.write("Input file not readable \n")
         raise GenericError('Mandatory file not readable. Please check %s \n' % bibcode_file_path)
     
     bibcodes_list = []
     
     for bibrow in bibfile:
         if bibrow[0] != " ":
             bibrow_elements = bibrow.split('\t')
             bibcode = bibrow_elements[0].rstrip('\n')
             if bibcode != '':
                 bibcodes_list.append(bibcode)
     
     bibfile.close()
     del bibfile
     #return the list of bibcodes        
     return bibcodes_list
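
# A minimal standalone sketch (not part of the original class) of the parsing
# rule used by read_bibcode_file: rows starting with a space are skipped, and
# only the text before the first tab counts as a bibcode.
def _demo_parse_bibcode_rows():
    rows = ["2011ApJ...001A\tsome metadata\n", " skipped row\n", "\n"]
    bibcodes = []
    for row in rows:
        if row[0] != " ":
            bibcode = row.split('\t')[0].rstrip('\n')
            if bibcode != '':
                bibcodes.append(bibcode)
    return bibcodes  # -> ['2011ApJ...001A']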
 def rem_bibs_to_extr_del(self, extraction_dir):
     """method that finds the bibcodes to extract and to delete not processed in an extraction """
     printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
     #first I extract the list of bibcodes that I had to extract
     bibcodes_to_extract = self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['new']))
     #then the ones I had to delete
     bibcodes_to_delete = self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['del']))
     #then the ones that had problems during the extraction
     bibcodes_probl = self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['prob']))
     #finally the ones that have been extracted correctly
     bibcodes_done = self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['done']))
     
     bibcode_processed = list(set(bibcodes_probl).union(set(bibcodes_done)))
     #then I find the ones remaining to extract
     bibcodes_to_extract_remaining = list(set(bibcodes_to_extract) - set(bibcode_processed))
     #then I find the ones remaining to delete
     bibcodes_to_delete_remaining = list(set(bibcodes_to_delete) - set(bibcode_processed))
     
     #now I want the extraction list ordered with the preprints first and then the other bibcodes
     #only if I have something remaining
     if len(bibcodes_to_extract_remaining) > 0:
         #I load the saved preprint file 
         bibcodes_preprint = self.read_bibcode_file(os.path.join(extraction_dir, 'PRE_'+os.path.basename(settings.BIBCODES_PRE)))
         remaining_preprint = list(set(bibcodes_to_extract_remaining).intersection(set(bibcodes_preprint)))
         remaining_preprint.sort()
         other_remaining = list(set(bibcodes_to_extract_remaining) - set(remaining_preprint))
         other_remaining.sort()
         bibcodes_to_extract_remaining = remaining_preprint + other_remaining
     
     return (bibcodes_to_extract_remaining, bibcodes_to_delete_remaining)
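
# A toy illustration (hypothetical bibcodes) of the set algebra used by
# rem_bibs_to_extr_del: everything already done or problematic counts as
# processed, and only the difference still has to be extracted or deleted.
def _demo_remaining_sets():
    to_extract = set(['bibA', 'bibB', 'bibC'])
    done = set(['bibA'])
    problematic = set(['bibB'])
    processed = done.union(problematic)
    return sorted(to_extract - processed)  # -> ['bibC']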
def problematic_extraction_process(q_probl, num_active_workers, lock_stdout, q_life, extraction_directory, verbose):
    """Worker that takes care of the bibcodes that couldn't be extracted and writes them to the related file"""

    while True:
        group_probl = q_probl.get()

        # first of all I check if the group I'm getting is a message from a process that finished
        if group_probl[0] == "WORKER DONE":
            num_active_workers = num_active_workers - 1
            # if there are no active workers anymore, I'm done with processing output
            if num_active_workers == 0:
                break
        else:
            # otherwise I process the output:
            # I put the bibcodes in the file of the problematic bibcodes
            if len(group_probl[1]) > 0:
                w2f = write_files.WriteFile(extraction_directory, verbose)
                w2f.write_problem_bibcodes_to_file(group_probl[1])

                lock_stdout.acquire()
                printmsg(
                    True,
                    multiprocessing.current_process().name
                    + (" (problematic bibcodes worker) wrote problematic bibcodes for group %s \n" % group_probl[0]),
                )
                lock_stdout.release()

    # I tell the manager that I'm done and I'm exiting
    q_life.put(["PROBLEMBIBS DONE"])

    lock_stdout.acquire()
    printmsg(True, multiprocessing.current_process().name + " (problematic bibcodes worker) job finished: exiting \n")
    lock_stdout.release()
 def init_stylesheet(self):
     """ Method that initialize the transformation engine """
     printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
     #create the stylesheet obj
     try:
         self.style_obj = libxslt.parseStylesheetDoc(libxml2.parseFile(self.stylesheet))
     except Exception:
         raise GenericError("ERROR: problem loading stylesheet \n")
     
     return True
 def get_all_bibcodes(self):
     """Method that retrieves the complete list of bibcodes"""
     printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
     # Timestamps ordered by increasing order of importance.
     timestamp_files_hierarchy = [settings.BIBCODES_GEN, settings.BIBCODES_PRE, settings.BIBCODES_PHY, settings.BIBCODES_AST ]
     
     bibcodes = set()
     for filename in timestamp_files_hierarchy:
         db_bibcodes = self.read_bibcode_file(filename)
         bibcodes = bibcodes.union(set(db_bibcodes))
     bibcodes_list = list(bibcodes)
     bibcodes_list.sort()
     return bibcodes_list
    def process_bibcodes_to_delete(self):
        """method that creates the MarcXML for the bibcodes to delete"""
        printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))

        # I create a unique file for all the bibcodes to delete:
        # I don't think it's necessary to split the content in groups, since the XML is really simple

        # I create the base object for the tree
        doc = libxml2.newDoc("1.0")
        root = doc.newChild(None, "collection", None)

        # then for each bibcode to delete I create the proper record
        for bibcode in self.bibcodes_to_delete_list:
            record = root.newChild(None, "record", None)
            # I add to the record the 2 necessary datafields
            d970 = record.newChild(None, "datafield", None)
            d970.setProp("tag", "970")
            d970.setProp("ind1", "")
            d970.setProp("ind1", "")
            # I create the subfield tag
            sub = d970.newChild(None, "subfield", bibcode.replace("&", "&amp;"))
            sub.setProp("code", "a")
            d980 = record.newChild(None, "datafield", None)
            d980.setProp("tag", "980")
            d980.setProp("ind1", "")
            d980.setProp("ind1", "")
            # I create the subfield tag
            sub = d980.newChild(None, "subfield", "DELETED")
            sub.setProp("code", "c")

        # I extract the node
        marcxml_string = doc.serialize("UTF-8", 2)
        # I remove the data
        doc.freeDoc()
        del doc

        # I write to the file
        w2f = write_files.WriteFile(self.extraction_directory, self.verbose)
        filename_delete = w2f.write_bibcodes_to_delete_file(
            marcxml_string, self.bibcodes_to_delete_list, self.extraction_name
        )

        if filename_delete:
            printmsg(
                self.verbose,
                "The MarcXML for the bibcode to delete has been written to the file %s \n" % filename_delete,
            )
        else:
            raise GenericError("Impossible to create the file for the MarcXML of the bibcodes to delete")

        return True
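
    # For reference, each delete record built above serializes to MarcXML of
    # this shape (bibcode hypothetical, indentation approximate):
    #
    #   <record>
    #     <datafield tag="970" ind1="" ind2="">
    #       <subfield code="a">2011ApJ...001A</subfield>
    #     </datafield>
    #     <datafield tag="980" ind1="" ind2="">
    #       <subfield code="c">DELETED</subfield>
    #     </datafield>
    #   </record>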
 def extr_diff_bibs_from_extraction(self, extraction_dir):
     """method that extracts the list of bibcodes not processed from a directory used for an extraction"""
     printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3])) 
     #first I extract the list of bibcodes that I had to extract
     bibcodes_to_extract = self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['new']))
     #then the ones I had to delete
     bibcodes_to_delete = self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['del']))
     #then the ones that had problems during the extraction
     bibcodes_probl = self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['prob']))
     #finally the ones that have been extracted correctly
     bibcodes_done = self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['done']))
     #then I extract the ones remaining
     bibcodes_remaining = list((set(bibcodes_to_extract).union(set(bibcodes_to_delete))) - (set(bibcodes_probl).union(set(bibcodes_done))))
     return bibcodes_remaining
    def write_done_bibcodes_to_file(self, bibcodes_list):
        """Method that writes a list of bibcodes in the file of the done bibcodes"""
        printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
        
        filepath = os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, settings.BASE_FILES['done'])
        
        try:
            file_obj = open(filepath, 'a')
            for bibcode in bibcodes_list:
                file_obj.write(bibcode+'\n')
            file_obj.close()
        except Exception:
            raise GenericError('Impossible to write in the "bibcode done file" %s \n' % filepath)

        return True
    def manage(self):
        """public function"""
        printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3])) 

        #If there is a wrong mode, I will raise an exception
        if self.mode not in ('full', 'update'):
            raise GenericError('Wrong parameter: the extraction can only be full or update')
        #otherwise I proceed
        else:
            #retrieve the list of bibcode to extract and the list of bibcodes to delete
            (bibcodes_to_extract_list, bibcodes_to_delete_list) = self.retrieve_bibcodes_to_extract()
            #call the extractor manager
            are = ads_record_extractor.ADSRecordExtractor(bibcodes_to_extract_list, bibcodes_to_delete_list, self.dirname, self.verbose)
            del bibcodes_to_extract_list
            del bibcodes_to_delete_list
            are.extract()
            
            return
    def write_marcxml_file(self, xmlstring, taskname, extraction_name):
        """method that writes the marcXML to a file naming it in the proper way"""
        printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
        
        filename = settings.MARCXML_FILE_BASE_NAME + '_' + extraction_name + '_' + taskname + '.xml'
        filepath = os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, filename)
        
        printmsg(self.verbose, "Writing the MarcXML file %s \n" % filepath) 
        #then I actually write the file
        try:
            file_obj = open(filepath,'w')
            file_obj.write(xmlstring)
            file_obj.close()
        except Exception:
            return False
        
        del file_obj, xmlstring

        return filepath
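
    # A hypothetical example of the naming scheme above: assuming
    # MARCXML_FILE_BASE_NAME = 'marcxml', extraction_name = 'EXTRACTION1' and
    # taskname = '0000001' (the zero-padded group id), the file written is
    # BASE_OUTPUT_PATH/<dirname>/marcxml_EXTRACTION1_0000001.xml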
 def transform(self, doc):
     """ Method that actually make the transformation"""
     printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
     
     #I load the stylesheet
     self.init_stylesheet()
             
     #transformation
     try:
         doc = self.style_obj.applyStylesheet(doc, None)
     except Exception:
         printmsg(True, "ERROR: Transformation failed \n") 
         return False
     
     #to string
     result = self.style_obj.saveResultToString(doc)
     
     #self.styleObj.freeStylesheet()
     doc.freeDoc()
     
     return result
 def retrieve_bibcodes_to_extract(self):
     """method that retrieves the bibcodes that need to be extracted from ADS"""
     printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
     
     #check the status of the last extraction
     status_last_extraction = self.check_last_extraction()
     
     if status_last_extraction in ('OK', 'NOTHING FOUND', 'NOT VALID DIRECTORY CONTENT'):
         printmsg(self.verbose, "Last extraction was fine: proceeding with a new one \n")
         #I create directory and files of bibcodes to extract
         self.dirname = strftime("%Y_%m_%d-%H_%M_%S")
         os.mkdir(os.path.join(settings.BASE_OUTPUT_PATH, self.dirname), 0755)
         for filetype in settings.BASE_FILES:
             fileobj = open(os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, settings.BASE_FILES[filetype]),'w')
             fileobj.write('')
             fileobj.close()
         # I write also the file to log the extraction name
         fileobj = open(os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, settings.EXTRACTION_FILENAME_LOG),'w')
         fileobj.write('')
         fileobj.close()
         del fileobj
         #then I extract the list of bibcodes according to "mode"
         if self.mode == 'full':
             #if mode == full I have to extract all the bibcodes
             return self.extract_full_list_of_bibcodes()
         elif self.mode == 'update':
             return self.extract_update_list_of_bibcodes()
     else:
         printmsg(self.verbose, "Last extraction was not fine: recovering \n")
         #I retrieve the bibcodes missing from the last extraction
         self.dirname = self.lastest_extr_dir
         return self.rem_bibs_to_extr_del(os.path.join(settings.BASE_OUTPUT_PATH, self.lastest_extr_dir))
 def write_bibcodes_to_delete_file(self, xmlstring, bibcodes_list, extraction_name):
     """method that writes the file with the bibcodes to delete and updates the file with the done bibcodes"""
     printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
     
     #I build the complete path and filename for the file to extract
     filename = settings.BIBCODE_TO_DELETE_OUT_NAME + '_' + extraction_name + '.xml'
     filepath = os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, filename)
     
     printmsg(self.verbose, "Writing the MarcXML file %s \n" % filepath) 
     #then I actually write the file
     try:
         file_obj = open(filepath,'w')
         file_obj.write(xmlstring)
         file_obj.close()
     except Exception:
         return False
     
     del file_obj, xmlstring
     
     #then I append the list of bibcodes actually written to the "done file"
     bibdone_filename = os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, settings.BASE_FILES['done'])
     printmsg(self.verbose, 'Updating the "processed bibcodes" file %s \n' % bibdone_filename) 
     try:
         file_obj = open(bibdone_filename, 'a')
         for bibcode in bibcodes_list:
             file_obj.write(bibcode+'\n')
         file_obj.close()
     except Exception:
         raise GenericError('Impossible to write in the "bibcode done file" %s \n' % bibdone_filename)
     
     del file_obj, bibcodes_list
     
     return filepath
    def extract(self):
        """manager of the extraction"""
        printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))

        ########################################################################
        # part where the bibcode to delete are processed

        # I have to upload first the bibcodes to delete and then the others.
        # So I process them first
        if self.bibcodes_to_delete_list:
            try:
                self.process_bibcodes_to_delete()
            except Exception:
                printmsg(True, "Unable to process the bibcodes to delete \n")
                raise GenericError("Unable to process the bibcodes to delete \n")

        ########################################################################
        # part where the bibcode to extract (new or update) are processed

        # I split the list of bibcodes to process in multiple groups
        bibtoprocess_splitted = self.grouper(settings.NUMBER_OF_BIBCODES_PER_GROUP, self.bibcodes_to_extract_list)

        # I define a manager for the workers
        manager = multiprocessing.Process(
            target=extractor_manager_process,
            args=(bibtoprocess_splitted, self.extraction_directory, self.extraction_name, self.verbose),
        )
        # I start the process
        manager.start()
        # I join the process
        manager.join()

        printmsg(True, "Extraction ended! \n")
def done_extraction_process(q_done, num_active_workers, lock_stdout, q_life, extraction_directory, verbose):
    """Worker that takes care of the groups of bibcodes processed and writes the bibcodes to the related file
        NOTE: this can also be the process that submits the upload tasks to Invenio
    """
    while True:
        group_done = q_done.get()

        # first of all I check if the group I'm getting is a message from a process that finished
        if group_done[0] == "WORKER DONE":
            num_active_workers = num_active_workers - 1
            # if there are no active workers anymore, I'm done with processing output
            if num_active_workers == 0:
                break
        else:
            # otherwise I process the output:
            # I put the bibcodes in the file of the done bibcodes
            if len(group_done[1]) > 0:
                w2f = write_files.WriteFile(extraction_directory, verbose)
                w2f.write_done_bibcodes_to_file(group_done[1])

                lock_stdout.acquire()
                printmsg(
                    True,
                    multiprocessing.current_process().name
                    + (" (done bibcodes worker) wrote done bibcodes for group %s \n" % group_done[0]),
                )
                lock_stdout.release()

            # I call the procedure to submit to invenio the process to upload the file
            filename_path = group_done[2]

            # invenio.bibtask.task_low_level_submission http://bit.ly/nnQZbs

    # I tell the manager that I'm done and I'm exiting
    q_life.put(["DONEBIBS DONE"])

    lock_stdout.acquire()
    printmsg(True, multiprocessing.current_process().name + " (done bibcodes worker) job finished: exiting \n")
    lock_stdout.release()
 def extract_full_list_of_bibcodes(self):
     """ method that extracts the complete list of bibcodes
         it first extracts the list of arxiv bibcodes and then all the others
     """
     printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
         
     #first I extract the list of preprints
     preprint_bibcodes = self.read_bibcode_file(settings.BIBCODES_PRE)
     #I copy the preprint file, because I need a copy locally
     try:
         shutil.copy(settings.BIBCODES_PRE, os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, 'PRE_'+os.path.basename(settings.BIBCODES_PRE)))
     except Exception:
         raise GenericError('Impossible to copy a mandatory file from %s to %s' % (settings.BIBCODES_PRE, os.path.join(settings.BASE_OUTPUT_PATH, self.dirname)))
     #then I extract the complete list
     #all_bibcodes = self.read_bibcode_file(settings.BIBCODES_ALL)
     all_bibcodes = self.get_all_bibcodes()
     not_pre_bibcodes = list(set(all_bibcodes) - set(preprint_bibcodes))
     not_pre_bibcodes.sort()
     
     #I write these lists bibcodes to the file of bibcodes to extract
     #and in the meanwhile I create the list with first the preprint and then the published
     bibcode_file = open(os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, settings.BASE_FILES['new']), 'a')
     bibcode_to_extract = []
     #first the preprints because they can be overwritten by the published ones
     for bibcode in preprint_bibcodes:
         bibcode_file.write(bibcode + '\n')
         bibcode_to_extract.append(bibcode)
     #then all the other bibcodes
     for bibcode in not_pre_bibcodes:
         bibcode_file.write(bibcode + '\n')
         bibcode_to_extract.append(bibcode)
     bibcode_file.close()
     del bibcode_file
     
     printmsg(self.verbose, "Full list of bibcodes and related file generated \n")
     #finally I return the full list of bibcodes and an empty list for the bibcodes to delete
     return (bibcode_to_extract, [])
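
# A toy illustration (hypothetical bibcodes) of the ordering produced by
# extract_full_list_of_bibcodes: preprints first, so that the published
# records extracted later can overwrite them.
def _demo_extraction_order():
    preprint_bibcodes = ['2011arXiv0001B']
    all_bibcodes = ['2011ApJ...001A', '2011arXiv0001B']
    not_pre = sorted(set(all_bibcodes) - set(preprint_bibcodes))
    return preprint_bibcodes + not_pre  # -> ['2011arXiv0001B', '2011ApJ...001A']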
    def set_extraction_name(self):
        """Method that sets the name of the current extraction"""
        printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))

        filepath = os.path.join(settings.BASE_OUTPUT_PATH, self.extraction_directory, settings.EXTRACTION_FILENAME_LOG)
        file_obj = open(filepath, "r")
        rows = file_obj.readlines()
        file_obj.close()
        if len(rows) > 0:
            last_name = rows[len(rows) - 1]
            number_ext = int(last_name.split(settings.EXTRACTION_BASE_NAME)[1])
            number_ext = number_ext + 1
        else:
            last_name = None
            number_ext = 1

        extraction_name = settings.EXTRACTION_BASE_NAME + str(number_ext)
        # Then I write the number of extraction to the file
        file_obj = open(filepath, "a")
        file_obj.write(extraction_name + "\n")
        file_obj.close()

        return extraction_name
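
    # A hypothetical example of the scheme above: assuming
    # EXTRACTION_BASE_NAME = 'EXTRACTION' and a log file whose last row is
    # 'EXTRACTION2', the new extraction is named 'EXTRACTION3' and that name
    # is appended to the log file.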
 def extract_update_list_of_bibcodes(self):
     """Method that extracts the list of bibcodes to update"""
     printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
     
     records_added, records_modified, records_deleted = timestamp_manager.get_records_status(self.verbose)
     
     bibcodes_to_extract = list(records_added) + list(records_modified)
     bibcodes_to_extract.sort()
     bibcodes_to_delete = list(records_deleted)
     bibcodes_to_delete.sort()
     #then I write all these bibcodes to the proper files
     #first the ones to extract
     bibcode_file = open(os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, settings.BASE_FILES['new']), 'a')
     for bibcode in bibcodes_to_extract:
         bibcode_file.write(bibcode + '\n')
     bibcode_file.close()
     #then the ones to delete
     bibcode_file = open(os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, settings.BASE_FILES['del']), 'a')
     for bibcode in bibcodes_to_delete:
         bibcode_file.write(bibcode + '\n')
     bibcode_file.close()
     
     #I return the list of bibcodes to extract and the list of bibcodes to delete
     return (bibcodes_to_extract, bibcodes_to_delete)
def get_records_status(verbose=False):
    """
    Return 3 sets of bibcodes:
    * bibcodes added are bibcodes that are in ADS and not in Invenio.
    * bibcodes modified are bibcodes that are both in ADS and in Invenio and
      that have been modified since the last update.
    * bibcodes deleted are bibcodes that are in Invenio but not in ADS.
    """
    records_added = []
    records_modified = []
    records_deleted = []

    printmsg(verbose, "Getting ADS timestamps. \n")
    ads_timestamps = _get_ads_timestamps()
    printmsg(verbose, "Getting ADS bibcodes. \n")
    ads_bibcodes = set(ads_timestamps.keys())
    printmsg(verbose, "Getting Invenio bibcodes. \n")
    invenio_bibcodes = _get_invenio_bibcodes()

    printmsg(verbose, "Deducting the added records. \n")
    records_added = ads_bibcodes - invenio_bibcodes
    printmsg(verbose, "    %d records to add." % len(records_added))
    printmsg(verbose, "Deducting the deleted records. \n")
    records_deleted = invenio_bibcodes - ads_bibcodes
    printmsg(verbose, "    %d records to delete." % len(records_deleted))

    records_to_check = invenio_bibcodes - records_deleted
    printmsg(verbose, "Checking timestamps for %d records. \n" % len(records_to_check))

    # TODO: This can probably be sped up by working with chunks of bibcodes
    # instead of single bibcodes.
    for bibcode in records_to_check:
        ads_timestamp = ads_timestamps[bibcode]

        invenio_recid = get_mysql_recid_from_aleph_sysno(bibcode)
        invenio_timestamp = get_fieldvalues(invenio_recid, "995__a")
        if not invenio_timestamp:
            # Maybe we could add instead of exiting.
            printmsg(True, "ERROR: Record %s in Invenio does not " "have a timestamp. \n" % bibcode)
            sys.exit(1)
        elif invenio_timestamp != ads_timestamp:
            records_modified.append(bibcode)

    printmsg(verbose, "Done.")

    return records_added, records_modified, records_deleted
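
# A toy illustration (hypothetical bibcodes and timestamps) of the
# classification performed by get_records_status:
def _demo_records_status():
    ads = {'bibA': 't2', 'bibB': 't1', 'bibC': 't1'}      # bibcode -> ADS timestamp
    invenio = {'bibB': 't1', 'bibC': 't0', 'bibD': 't0'}  # bibcode -> Invenio timestamp
    added = set(ads) - set(invenio)                        # set(['bibA'])
    deleted = set(invenio) - set(ads)                      # set(['bibD'])
    modified = [b for b in set(invenio) - deleted if invenio[b] != ads[b]]
    return added, modified, deleted                        # modified -> ['bibC']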
 def check_last_extraction(self):
     """method that checks if the last extraction finished properly"""
     printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
         
     #I retrieve the list of entries in the output directory
     list_of_elements = os.listdir(settings.BASE_OUTPUT_PATH)
     #I extract only the directories
     directories = []
     for elem in list_of_elements:
         if os.path.isdir(os.path.join(settings.BASE_OUTPUT_PATH, elem)):
             directories.append(elem)
     
     #I set a variable for the latest dir of extraction
     self.lastest_extr_dir = ''
     
     #if I don't have any result I return the proper status
     if len(directories) == 0:
         printmsg(self.verbose, "Checked last extraction: status returned NOTHING FOUND \n")
         return 'NOTHING FOUND'
     else: 
         #I sort the directories in descending order and I take the first one
         directories.sort(reverse=True)
         self.lastest_extr_dir = directories[0]
         
         printmsg(self.verbose, "Checking the directory %s \n" % os.path.join(settings.BASE_OUTPUT_PATH, self.lastest_extr_dir))
         
         #I extract the content of the last extraction
         elements_from_last_extraction = os.listdir(os.path.join(settings.BASE_OUTPUT_PATH, self.lastest_extr_dir))
         
         #then I check if all the mandatory files are there, otherwise the directory content is not valid
         for name in settings.BASE_FILES:
             if settings.BASE_FILES[name] not in elements_from_last_extraction:
                 printmsg(self.verbose, "Checked last extraction: status returned NOT VALID DIRECTORY CONTENT \n")
                 return 'NOT VALID DIRECTORY CONTENT'
     
         #if I pass all these checks the content is basically fine
         #But then I have to check if the lists of bibcodes are consistent: bibcodes extracted + bibcodes with problems = sum(bibcodes to extract)
         printmsg(self.verbose, "Checking if the list of bibcodes actually extracted is equal to the one I had to extract \n")
         bibcodes_still_pending = self.extr_diff_bibs_from_extraction(os.path.join(settings.BASE_OUTPUT_PATH, self.lastest_extr_dir))
         if len(bibcodes_still_pending) == 0:
             printmsg(self.verbose, "All the bibcodes from the last extraction have been processed \n")
         else:
             printmsg(self.verbose, "Checked last extraction: status returned LATEST NOT ENDED CORRECTLY \n")
             return 'LATEST NOT ENDED CORRECTLY'
     
     #if everything is Ok I return it
     printmsg(self.verbose, "Checked last extraction: status returned OK \n")
     return 'OK'
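
 # For reference, the four statuses returned by check_last_extraction:
 # 'OK', 'NOTHING FOUND' and 'NOT VALID DIRECTORY CONTENT' let
 # retrieve_bibcodes_to_extract start a fresh extraction, while
 # 'LATEST NOT ENDED CORRECTLY' triggers the recovery path.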
 def grouper(self, n, iterable):
     """method to split a list in multiple groups"""
     printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
     args = [iter(iterable)] * n
     return list(([e for e in t if e is not None] for t in itertools.izip_longest(*args)))
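
 # Example of the grouping above: grouper(3, [1, 2, 3, 4, 5]) returns
 # [[1, 2, 3], [4, 5]]; izip_longest pads the last group with None and the
 # inner comprehension filters the padding out.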
def extractor_process(q_todo, q_done, q_probl, lock_stdout, q_life, extraction_directory, extraction_name, verbose):
    """Worker function for the extraction of bibcodes from ADS
        it has been defined outside any class because it's simpler to handle with multiprocessing """
    # I get the maximum number of groups I can process
    max_num_groups = settings.MAX_NUMBER_OF_GROUP_TO_PROCESS
    # variable used to know if I'm exiting because the queue is empty or because I reached the maximum number of groups to process
    queue_empty = False

    # I keep processing groups until the todo queue is empty or I reach the maximum number of groups I can process
    for grpnum in range(max_num_groups):

        task_todo = q_todo.get()
        if task_todo[0] == "STOP":

            queue_empty = True
            # I exit the loop
            break

        # I print when I'm starting the extraction
        lock_stdout.acquire()
        printmsg(
            True, multiprocessing.current_process().name + (" (worker) starting to process group %s \n" % task_todo[0])
        )
        lock_stdout.release()

        ############
        # then I process the bibcodes
        # I define a couple of lists in which to store the processed bibcodes
        bibcodes_ok = []
        bibcodes_probl = []

        # I define an ADSRecords object
        recs = ads.ADSExports.ADSRecords("full", "XML")

        # I define a maximum amount of bibcodes I can skip per cycle: the number of bibcodes per group / 10 (minimum 500)
        # if I skip more than this amount it means that there is something
        # wrong with the access to the data and it's better to stop everything
        max_number_of_bibs_to_skip = max(settings.NUMBER_OF_BIBCODES_PER_GROUP / 10, 500)

        for bibcode in task_todo[1]:
            try:
                recs.addRecord(bibcode)
                bibcodes_ok.append(bibcode)
            except:
                printmsg(True, 'ERROR: problem retrieving the bibcode "%s" \n' % bibcode)
                bibcodes_probl.append(bibcode)
                max_number_of_bibs_to_skip = max_number_of_bibs_to_skip - 1
            # If I reach 0 it means that I skipped too many bibcodes and there is probably a problem: so I simulate an exit for an empty queue
            if max_number_of_bibs_to_skip == 0:
                break
        # if the limit was hit, I exit the outer loop too
        if max_number_of_bibs_to_skip == 0:
            lock_stdout.acquire()
            printmsg(
                True,
                multiprocessing.current_process().name
                + (
                    " (worker) Detected possible error with ADS data access: skipped %s bibcodes in one group \n"
                    % max(settings.NUMBER_OF_BIBCODES_PER_GROUP / 10, 500)
                ),
            )
            lock_stdout.release()
            queue_empty = True
            break

        # I extract the object I created
        xmlobj = recs.export()
        del recs

        try:
            # I define a transformation object
            transf = xml_transformer.XmlTransformer(verbose)
            # and I transform my object
            marcxml = transf.transform(xmlobj)
        except Exception:
            raise GenericError("Impossible to transform the XML!")

        # if the transformation was ok, I write the file
        if marcxml:
            w2f = write_files.WriteFile(extraction_directory, verbose)
            wrote_filename = w2f.write_marcxml_file(marcxml, task_todo[0], extraction_name)
            # if the writing of the xml is wrong I consider all the bibcodes problematic
            if not wrote_filename:
                bibcodes_probl = bibcodes_probl + bibcodes_ok
                bibcodes_ok = []
            del w2f
        # otherwise I put all the bibcodes in the problematic
        else:
            bibcodes_probl = bibcodes_probl + bibcodes_ok
            bibcodes_ok = []
            wrote_filename = False

        # finally I pass the done bibcodes to the proper queue
        q_done.put([task_todo[0], bibcodes_ok, wrote_filename])
        # and the problematic bibcodes
        q_probl.put([task_todo[0], bibcodes_probl])

        lock_stdout.acquire()
        printmsg(
            True, multiprocessing.current_process().name + (" (worker) finished processing group %s \n" % task_todo[0])
        )
        lock_stdout.release()

    if queue_empty:
        # I tell the output processes that I'm done
        q_done.put(["WORKER DONE"])
        q_probl.put(["WORKER DONE"])
        # I tell the manager that I'm dying because the queue is empty
        q_life.put(["QUEUE EMPTY"])
        lock_stdout.acquire()
        printmsg(True, multiprocessing.current_process().name + " (worker) Queue empty: exiting \n")
        lock_stdout.release()
    else:
        # I tell the manager that I'm dying because I reached the maximum amount of group to process
        q_life.put(["MAX LIFE REACHED"])
        lock_stdout.acquire()
        printmsg(
            True,
            multiprocessing.current_process().name
            + " (worker) Maximum amount of groups of bibcodes reached: exiting \n",
        )
        lock_stdout.release()
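
# For reference, the message shapes flowing through the queues in this module:
#   q_todo:  [group_id, [bibcodes...]], or ["STOP", ""] to shut a worker down
#   q_done:  [group_id, bibcodes_ok, wrote_filename], or ["WORKER DONE"]
#   q_probl: [group_id, bibcodes_probl], or ["WORKER DONE"]
#   q_life:  ["QUEUE EMPTY"], ["MAX LIFE REACHED"], ["DONEBIBS DONE"] or
#            ["PROBLEMBIBS DONE"]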
def extractor_manager_process(bibtoprocess_splitted, extraction_directory, extraction_name, verbose):
    """Process that takes care of managing all the other worker processes
        this process also creates new worker processes when the existing ones reach the maximum number of groups of bibcodes to process
    """
    # a queue for the bibcodes to process
    q_todo = multiprocessing.Queue()
    # a queue for the bibcodes processed
    q_done = multiprocessing.Queue()
    # a queue for the bibcodes with problems
    q_probl = multiprocessing.Queue()
    # a lock to write in stdout
    lock_stdout = multiprocessing.Lock()
    # a queue for the messages from the workers that have to tell the manager when they reach the maximum number of chunks to process
    q_life = multiprocessing.Queue()

    lock_stdout.acquire()
    printmsg(verbose, multiprocessing.current_process().name + " (Manager) Filling the queue with the tasks \n")
    lock_stdout.release()

    # the bibcodes are already split in groups of NUMBER_OF_BIBCODES_PER_GROUP: I put each group in the todo queue
    counter = 0  # I need the counter to uniquely identify each group
    for grp in bibtoprocess_splitted:
        counter += 1
        q_todo.put([str(counter).zfill(7), grp])

    lock_stdout.acquire()
    printmsg(verbose, multiprocessing.current_process().name + " (Manager) Creating the first pool of workers \n")
    lock_stdout.release()

    # I define the number of processes to run
    number_of_processes = settings.NUMBER_WORKERS  # in production this should be derived from multiprocessing.cpu_count()

    # I define the worker processes
    processes = [
        multiprocessing.Process(
            target=extractor_process,
            args=(q_todo, q_done, q_probl, lock_stdout, q_life, extraction_directory, extraction_name, verbose),
        )
        for i in range(number_of_processes)
    ]

    # I append to the todo queue a list of commands to stop the worker processes
    for i in range(number_of_processes):
        q_todo.put(["STOP", ""])

    lock_stdout.acquire()
    printmsg(verbose, multiprocessing.current_process().name + " (Manager) Creating the output workers \n")
    lock_stdout.release()

    # I define a "done bibcode" worker
    donebib = multiprocessing.Process(
        target=done_extraction_process,
        args=(q_done, number_of_processes, lock_stdout, q_life, extraction_directory, verbose),
    )
    # I define a "problematic bibcode" worker
    problbib = multiprocessing.Process(
        target=problematic_extraction_process,
        args=(q_probl, number_of_processes, lock_stdout, q_life, extraction_directory, verbose),
    )

    lock_stdout.acquire()
    printmsg(verbose, multiprocessing.current_process().name + " (Manager) Starting all the workers \n")
    lock_stdout.release()

    # I start the worker processes
    for p in processes:
        p.start()
    # and the output handlers
    donebib.start()
    problbib.start()

    # I join all the processes
    # for p in processes:
    #    p.join()
    # donebib.join()
    # problbib.join()

    # then I have to wait for the workers that have to tell me if they reached the maximum amount of chunk to process or if the extraction ended
    # in the first case I have to start another process
    # in the second I have to decrease the counter of active workers
    active_workers = settings.NUMBER_WORKERS
    additional_workers = 2
    while active_workers > 0 or additional_workers > 0:
        # I get the message from the worker
        death_reason = q_life.get()
        # if the reason of the death is that the process reached the max number of groups to process, then I have to start another one
        if death_reason[0] == "MAX LIFE REACHED":
            newprocess = multiprocessing.Process(
                target=extractor_process,
                args=(q_todo, q_done, q_probl, lock_stdout, q_life, extraction_directory, extraction_name, verbose),
            )
            newprocess.start()
            additional_workers = additional_workers - 1
            lock_stdout.acquire()
            printmsg(True, multiprocessing.current_process().name + " (Manager) New worker created \n")
            lock_stdout.release()
        elif death_reason[0] == "QUEUE EMPTY":
            active_workers = active_workers - 1
            lock_stdout.acquire()
            printmsg(
                verbose,
                multiprocessing.current_process().name
                + " (Manager) %s workers waiting to finish their job \n" % str(active_workers),
            )
            lock_stdout.release()
        elif death_reason[0] == "PROBLEMBIBS DONE":
            additional_workers = additional_workers - 1
            lock_stdout.acquire()
            printmsg(
                verbose,
                multiprocessing.current_process().name
                + " (Manager) %s additional workers waiting to finish their job \n" % str(additional_workers),
            )
            lock_stdout.release()
        elif death_reason[0] == "DONEBIBS DONE":
            additional_workers = additional_workers - 1
            lock_stdout.acquire()
            printmsg(
                verbose,
                multiprocessing.current_process().name
                + " (Manager) %s additional workers waiting to finish their job \n" % str(additional_workers),
            )
            lock_stdout.release()

    lock_stdout.acquire()
    printmsg(verbose, multiprocessing.current_process().name + " (Manager) All the workers are done. Exiting... \n")
    lock_stdout.release()