示例#1
0
    def preprocess(self):
        """Preprocess the documents in Data/Unprocessed.

        Every document under ``DataDir/Unprocessed/`` whose filename is not
        already in ``self.processed_docs`` is parsed. For each CIK the parser
        reports for the document, ``safelink`` is called to link it into
        ``DataDir/Preprocessed/<CIK>/`` as ``<CIK>_<FilingDate>.txt``.
        If a doc raises ``ParseError``, it is logged as a warning and linked
        (through the same ``safelink`` helper) into ``DataDir/Exceptions/``
        instead.

        The pre-processing step allows us to consider only one CIK
        at a time during the processing step, for memory efficiency.

        Side effects: changes the process working directory to
        ``DataDir/Unprocessed/`` and does not restore it; adds to
        ``self.processed_docs``, ``self.valid_docs`` and
        ``self.exception_docs``; prints the elapsed time.
        """
        n_proc = 0
        n_valid = 0
        n_except = 0
        start = time.time()
        preprocessed_root = self.DataDir + 'Preprocessed/'
        # The walk below starts from '.', so the docpaths it yields are
        # presumably relative to this directory -- the working directory must
        # stay here while they are in use.
        os.chdir(self.DataDir + 'Unprocessed/')
        # recursive_file_gen returns (path, filename) tuples for all files in
        # the directory and subdirectories that don't begin with '.' or '_'
        for (docpath, docname) in recursive_file_gen('.'):
            # Code assumes that docnames are unique
            if docname in self.processed_docs:
                continue
            self.processed_docs.add(docname)
            n_proc += 1

            # Only the parser call is guarded: it is the step expected to
            # raise ParseError for a malformed document.
            try:
                # The parser also returns the raw text, which is not needed
                # (and not processed) at this stage.
                (header, cik2filers, _) = rb_parser.parse_quarterly_filing(docpath)
            except ParseError as e:
                self.exception_docs.add(docname)
                n_except += 1
                logging.warning("%s: %s", docname, e)
                safelink(docpath, self.DataDir + 'Exceptions/' + basename(docpath))
                continue

            date = header['FilingDate']
            for CIK in cik2filers:
                new_docname = CIK + '_' + date + '.txt'
                cik_dir = preprocessed_root + CIK
                ensure(cik_dir)
                safelink(docpath, cik_dir + '/' + new_docname)
                if new_docname in self.valid_docs:
                    # Same CIK and filing date seen before: report it, but
                    # still count the link as before.
                    print("Repeated doc: %s" % new_docname)
                self.valid_docs.add(new_docname)
                n_valid += 1

        end = time.time()
        # Emitted at INFO level, so silent unless the application enables it.
        logging.info("Preprocessing summary: %d docs parsed, %d links made, %d exceptions",
                     n_proc, n_valid, n_except)
        print("Time elapsed in preprocessing: %.1f" % (end-start))
示例#2
0
    def process(self):
        """Process every CIK directory found under DataDir/Preprocessed.

        Entries whose name starts with '.' and entries that are not
        directories are skipped.

        For a CIK in ``self.good_CIKs``: the company is loaded, each filing in
        its directory is parsed, the filer info and raw text are handed to the
        company object, the filing date is appended to ``self.CIK2date[CIK]``,
        and the filing is moved to ``DataDir/Active/<CIK>/``. Afterwards the
        company's word set is built and collected, the company is saved, and
        the CIK is recorded under the company's SIC in ``self.industries``.

        For any other CIK: its filings are moved, unparsed, to
        ``DataDir/Inactive/<CIK>/``.

        In both cases the ``Preprocessed/<CIK>`` directory is removed
        afterwards.

        Side effects: changes the process working directory to
        ``DataDir/Preprocessed`` and does not restore it; prints the elapsed
        time.
        """
        start = time.time()
        os.chdir(self.DataDir + 'Preprocessed')
        # Iterate through all the preprocessed CIKs
        for CIK in os.listdir('.'):
            if CIK.startswith('.') or not os.path.isdir(CIK):
                continue

            if CIK in self.good_CIKs:
                self.active_CIKs.add(CIK)
                company = self.load_company(CIK)
                ensure(self.DataDir + 'Active/' + CIK)
                filing_dates = self.CIK2date.setdefault(CIK, [])
                for filing in os.listdir(CIK):
                    filingpath = CIK + '/' + filing
                    (header, filers, rawtext) = rb_parser.parse_quarterly_filing(filingpath)
                    # Update company properties with info taken from the
                    # 'filers' part of the document
                    company.properties(filers)
                    date = header['FilingDate']
                    # Creates a word dictionary and wordcount from the raw
                    # text returned by the parser
                    company.add_document(date, rawtext)
                    filing_dates.append(date)
                    self.active_docs.add(filing)
                    # Move the filing to the 'Active' directory - note this
                    # means atm all parsed data is stored in the directory
                    # structure
                    os.rename(filingpath, self.DataDir + 'Active/' + filingpath)
                company.build_wordset()
                self.company_word_sets.append(company.wordset)
                self.save_company(company)

                # Register the CIK under its SIC, creating the list on first use.
                industry_CIKs = self.industries.setdefault(company.SIC, [])
                if CIK not in industry_CIKs:
                    industry_CIKs.append(CIK)
                del company  # Get it out of memory. Probably unnecessary

            else:  # CIK not in self.good_CIKs
                self.inactive_CIKs.add(CIK)
                inactive_dir = self.DataDir + 'Inactive/' + CIK
                ensure(inactive_dir)
                for filing in os.listdir(CIK):
                    self.inactive_docs.add(filing)
                    os.rename(CIK + '/' + filing, inactive_dir + '/' + filing)

            # Every entry listed above has been moved out, so the directory
            # should be empty. rmdir removes just this one directory and never
            # walks up to prune parents.
            os.rmdir(CIK)
        end = time.time()
        print("Time elapsed in processing: %.1f" % (end-start))