Example #1
class NewsgroupImporter(ImporterBase):
    def load_documents(self):
        text_class_pairs = zip(self.rawdata.data, self.rawdata.target)
        for text, class_idx in ProgressIterator(text_class_pairs,
                                                doc_progress_label,
                                                length=len(self.rawdata.data)):
            classname = self.rawdata.target_names[class_idx]
            class_id = self.classinfo.increase_class_count(classname)
            self.docinfo.add_document(text, class_id)

    def run(self):
        # Load data with sklearn
        nbprint('Loading raw files')
        self.rawdata = fetch_20newsgroups(
            data_home=join(config.paths['rawdata'], 'sklearn'),
            remove=tuple(self.info['data_info']['remove']),
            subset='all')

        with data.document_writer(self.info) as document_writer:
            # Initialize info classes
            self.classinfo = ClassInfo()
            self.docinfo = DocumentInfo(document_writer)

            # Load documents and store class information in classinfo
            self.load_documents()

        # Print Meta Information
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)

        # Save classinfo
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)
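fetch_20newsgroups comes from scikit-learn and can be tried on its own before wiring it into an importer. A minimal sketch, assuming scikit-learn is installed; the cache directory below is a placeholder, not the importer's configured path:

from sklearn.datasets import fetch_20newsgroups

# Download (or load from a local cache) all ~18,000 posts, stripping
# headers, footers and quoted replies as the importer above does.
rawdata = fetch_20newsgroups(
    data_home='/tmp/sklearn',  # placeholder cache directory
    remove=('headers', 'footers', 'quotes'),
    subset='all')

# .data is a list of raw texts; .target holds indices into .target_names.
print(len(rawdata.data), rawdata.target_names[rawdata.target[0]])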
Example #2
class ClassicImporter(ImporterBase):
    def load_file(self, filename):
        filename = join(self.folder, filename)
        with open(filename, "r", encoding="utf8") as file:
            text = file.read()
        return text

    def load_documents(self):
        for filename in ProgressIterator(self.files, doc_progress_label):
            classname = filename.split(".")[0]
            class_id = self.classinfo.increase_class_count(classname)
            text = self.load_file(filename)
            self.docinfo.add_document(text, class_id)

    def run(self):
        # Get all files in the classic4 directory
        self.folder = join(config.paths["rawdata"], "classic4")
        try:
            self.files = listdir(self.folder)
        except FileNotFoundError:
            raise ImporterError(
                self.info, 'Directory "{}" does not exist'.format(self.folder))

        # Remove .gitignore file from list
        self.files = [file for file in self.files if file != '.gitignore']

        # Keep only files that start with a classname
        self.classnames = ['cacm', 'cisi', 'cran', 'med']
        self.files = [
            file for file in self.files
            if '.' in file and file.split('.')[0] in self.classnames
        ]

        # Check if files exist
        if len(self.files) == 0:
            raise ImporterError(self.info,
                                'There are no valid files in the folder.')

        with data.document_writer(self.info) as document_writer:
            # Initialize info classes
            self.classinfo = ClassInfo()
            self.docinfo = DocumentInfo(document_writer)

            # Load documents and store class information in classinfo
            self.load_documents()

        # Print Meta Information
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)

        # Save classinfo
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)
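The classic4 labels are recovered purely from the filenames: everything before the first dot is the class, which is why load_documents can use filename.split('.')[0]. A tiny illustration with invented filenames:

classnames = ['cacm', 'cisi', 'cran', 'med']
for filename in ['cacm.000001', 'med.000042', 'notes.txt']:
    prefix = filename.split('.')[0]
    print(filename, '->', prefix if prefix in classnames else 'skipped')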
Example #3
class YahooImporter(ImporterBase):
    def load_documents(self):
        nbprint('Loading xml file')

        self.documents = []
        filename = join(config.paths["rawdata"], "yahooL5/manner.xml")
        current_doc = None

        for event, elem in etree.iterparse(filename,
                                           events=('start', 'end'),
                                           recover=True):
            if elem.tag == "document":
                if event == "start":
                    current_doc = Document()
                elif event == "end":
                    if current_doc.complete():
                        self.documents.append(current_doc)
                    current_doc = None
            elif event == "end" and not current_doc is None:
                current_doc.add_elem(elem)

    def save_documents(self):
        nbprint('Saving documents')

        self.classinfo = ClassInfo()

        # Open Writer
        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)
            for doc in self.documents:
                text = doc.content['subject']
                class_id = self.classinfo.increase_class_count(
                    doc.content['maincat'])
                self.docinfo.add_document(text, class_id)

    def run(self):
        self.load_documents()
        self.save_documents()

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)

        # Print Meta Info
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)
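The Document helper is project-specific, but the start/end event pattern can be tried standalone with lxml; recover=True is what lets iterparse tolerate malformed markup in the dump. A self-contained sketch with an invented XML layout:

from io import BytesIO
from lxml import etree

xml = b"""<root>
  <document><subject>How do I ...?</subject><maincat>Health</maincat></document>
  <document><subject>Why is ...?</subject><maincat>Sports</maincat></document>
</root>"""

current = None
for event, elem in etree.iterparse(BytesIO(xml), events=('start', 'end'),
                                   recover=True):
    if elem.tag == 'document':
        if event == 'start':
            current = {}               # begin collecting child elements
        else:
            print(current)             # one complete document
            current = None
    elif event == 'end' and current is not None:
        current[elem.tag] = elem.text  # e.g. 'subject', 'maincat'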
Example #4
class ComplaintsImporter(ImporterBase):
    def load_classes(self, file):
        self.valid_classes = ClassInfo()
        min_length = self.info['data_info']['min_length']
        cr = csv.reader(file)
        next(cr)  # skip the header row
        for row in ProgressIterator(cr):
            classname = row[2]
            text = row[5]

            if len(text) >= min_length:
                self.valid_classes.increase_class_count(classname)
        min_class_size = self.info['data_info']['min_class_size']
        self.valid_classes = [
            c['info'] for c in self.valid_classes.make_class_list()
            if c['count'] > min_class_size
        ]

    def load_data(self, file):
        min_length = self.info['data_info']['min_length']
        cr = csv.reader(file)
        next(cr)  # skip the header row
        for row in ProgressIterator(cr):
            classname = row[2]
            text = row[5]

            if len(text) >= min_length and classname in self.valid_classes:
                class_id = self.classinfo.increase_class_count(classname)
                self.docinfo.add_document(text, class_id)

    def run(self):
        self.classinfo = ClassInfo()
        filename = join(config.paths["rawdata"],
                        "complaints/consumer_complaints.csv")

        with open(filename) as file:
            self.load_classes(file)

        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)
            with open(filename) as file:
                self.load_data(file)

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)

        # Print Meta Info
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)
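The importer streams the CSV twice: a first pass counts documents per class, a second writes only documents whose class survived the cutoff. The counting pass needs nothing beyond the standard library; a sketch with invented column positions and thresholds (the real values come from self.info['data_info']):

import csv
from collections import Counter

def frequent_classes(path, class_col=2, text_col=5, min_length=10,
                     min_class_size=2):
    counts = Counter()
    with open(path, newline='') as file:
        reader = csv.reader(file)
        next(reader)  # skip the header row
        for row in reader:
            if len(row[text_col]) >= min_length:
                counts[row[class_col]] += 1
    # mirrors the importer's strict ">" comparison
    return {c for c, n in counts.items() if n > min_class_size}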
Example #5
class TweetsLAImporter(ImporterBase):
    def get_archives(self, folder):
        # List all files
        try:
            files = listdir(folder)
        except FileNotFoundError:
            raise ImporterError(
                self.info, 'Directory "{}" does not exist'.format(folder))

        # Keep only .zip files
        archives = [file for file in files if file.split(".")[-1] == "zip"]
        return archives

    def parse_file(self, jsonfile):
        for line in ProgressIterator(jsonfile, 'Parsing tweets'):
            tweet = json.loads(line)
            if 'extended_tweet' in tweet:
                text = tweet['extended_tweet']['full_text']
            elif 'text' in tweet:
                text = tweet['text']
            else:
                continue
            self.docinfo.add_document(text)

    def import_archive(self):
        # Iterate all files in archive
        # "zf" avoids shadowing the builtin zip()
        with zipfile.ZipFile(self.archivepath) as zf:
            filenames = [info.filename for info in zf.infolist()]
            for filename in filenames:
                nbprint(filename)
                with zf.open(filename) as jsonfile:
                    self.parse_file(jsonfile)

    def run(self):
        # Open Writer
        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)

            # Iterate all archives
            folder = join(config.paths["rawdata"], "tweetsla")
            archives = self.get_archives(folder)
            for idx, archive in enumerate(archives):
                nbprint('{}/{}: {}'.format(idx + 1, len(archives),
                                           archive)).push()
                self.archivepath = join(folder, archive)
                self.import_archive()
                nbprint.pop()

            # Print Meta Info
            self.docinfo.save_meta(self.info)
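Each archive member is a JSON-lines file, one tweet per line, and zipfile can stream members without extracting them to disk. A standard-library sketch with a made-up archive name:

import json
import zipfile

with zipfile.ZipFile('tweets.zip') as zf:  # placeholder archive
    for name in zf.namelist():
        with zf.open(name) as jsonfile:
            for line in jsonfile:  # bytes, one JSON object per line
                tweet = json.loads(line)
                # prefer the untruncated text when the tweet carries one
                text = tweet.get('extended_tweet', {}).get(
                    'full_text', tweet.get('text'))
                if text is not None:
                    print(text[:60])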
Example #6
    def run(self):
        folder = join(config.paths["rawdata"], "atd")

        # List txt files
        try:
            files = listdir(folder)
        except FileNotFoundError:
            raise ImporterError(
                self.info, 'Directory "{}" does not exist'.format(folder))

        # Keep only .txt files
        files = [file for file in files if file.split(".")[-1] == "txt"]

        # Check if files exist
        if len(files) == 0:
            raise ImporterError(info,
                                'There are no valid files in the folder.')

        # Add files one by one
        with data.document_writer(self.info) as document_writer:
            docinfo = DocumentInfo(document_writer)
            for filename in ProgressIterator(files, doc_progress_label):
                # the list already contains only .txt files
                with open(join(folder, filename), "r",
                          encoding="utf8") as file:
                    text = file.read()
                    docinfo.add_document(text)
            # Print Meta Information
            docinfo.save_meta(self.info)
Example #7
class ACMImporter(ImporterBase):
    
    def import_archive(self):
        # Iterate all files in archive
        # "zf" avoids shadowing the builtin zip()
        with zipfile.ZipFile(self.archivepath) as zf:
            filenames = [info.filename for info in zf.infolist()]
            for filename in ProgressIterator(filenames):
                if filename.endswith('.txt'):
                    with zf.open(filename, 'r') as txtfile:
                        text = txtfile.read().decode('utf-8')
                        self.docinfo.add_document(text)
                    
    def run(self):
        # Open Writer
        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)
            
            # Open archive
            self.archivepath = join(config.paths['rawdata'],'acm/abstract.zip')  
            self.import_archive()
            
            # Print Meta Info
            self.docinfo.save_meta(self.info)
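import_archive expects rawdata/acm/abstract.zip to contain one .txt file per abstract. To exercise the importer without the real corpus, a stand-in archive can be built with the standard library; names and contents below are invented:

import zipfile

with zipfile.ZipFile('abstract.zip', 'w') as zf:
    zf.writestr('0001.txt', 'A fake abstract about topic models.')
    zf.writestr('0002.txt', 'Another fake abstract about corpus importers.')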
Example #8
class TweetsODPImporter(ImporterBase):
    def parse_files(self, jsonfile):
        nbprint("Loading documents")
        for line in ProgressIterator(jsonfile):

            tweet = json.loads(line)
            text = tweet["full_text"]

            # the numeric "id" field gets rounded in JSON parsing, so use "id_str"
            tweet_id = int(tweet["id_str"])
            classname = self.id_to_classname[tweet_id]

            if (self.max_docs_per_cls is not None
                    and self.classinfo.classes.get(
                        classname, (0, 0))[1] >= self.max_docs_per_cls):
                continue
            class_id = self.classinfo.increase_class_count(classname)
            self.docinfo.add_document(text, class_id)

    def load_id_to_classname(self, folderpath, filename):
        nbprint("Extracting tsv")

        self.id_to_classname = {}
        max_depth = self.info['data_info']['maxdepth']
        tarfilename = join(folderpath, filename + ".tar.bz2")

        with tarfile.open(tarfilename, "r:bz2") as tar:
            tsvfile = tar.extractfile(filename + ".tsv")
            for line in ProgressIterator(tsvfile):
                fields = line.decode().split()
                tweet_id = int(fields[0])
                classname = fields[3]

                classname = classname.strip("*")
                classhierarchy = classname.split("/")
                classhierarchy = classhierarchy[1:max_depth + 1]
                classname = "/".join(classhierarchy)

                self.id_to_classname[tweet_id] = classname

    def add_data(self, filename):
        nbprint("Loading '{}'".format(filename)).push()
        folderpath = join(config.paths["rawdata"], "tweetsodp")
        jsonfilename = join(folderpath, filename + ".json")
        zipfilename = join(folderpath, filename + ".json.zip")

        self.load_id_to_classname(folderpath, filename)
        if isfile(jsonfilename):
            with open(jsonfilename, "r") as jsonfile:
                self.parse_files(jsonfile)
        else:
            with zipfile.ZipFile(zipfilename) as zf:
                with zf.open(filename + ".json") as jsonfile:
                    self.parse_files(jsonfile)
        nbprint.pop()

    def run(self):
        self.classinfo = ClassInfo()
        self.max_docs_per_cls = self.info['data_info'].get(
            'max_docs_per_cls', None)
        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)
            self.add_data("ODPtweets-Mar17-29")
            self.add_data("ODPtweets-Apr12-24")

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)

        # Print Meta Info
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)
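load_id_to_classname streams the .tsv straight out of the .tar.bz2 via tarfile.extractfile, then truncates the ODP category path to maxdepth levels. A self-contained sketch of that pattern, with invented archive and member names:

import tarfile

with tarfile.open('ODPtweets.tar.bz2', 'r:bz2') as tar:
    tsvfile = tar.extractfile('ODPtweets.tsv')  # file-like object over bytes
    for line in tsvfile:
        fields = line.decode().split()
        tweet_id = int(fields[0])
        classname = fields[3].strip('*')
        # drop the leading component and truncate, e.g.
        # Top/Sports/Soccer with maxdepth=1 becomes Sports
        print(tweet_id, '/'.join(classname.split('/')[1:2]))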
Example #9
class ReutersImporter(ImporterBase):
    def count_docs_per_class(self):
        counts = {}
        for file in reuters.fileids():
            categories = reuters.categories(file)
            if len(categories) == 1:
                classname = categories[0]
                try:
                    counts[classname] += 1
                except KeyError:
                    counts[classname] = 1
        return counts

    def filter_classes(self, counts):
        newcounts = {}
        for key, val in counts.items():
            if val >= self.info['data_info']["min_docs_per_class"]:
                newcounts[key] = val
        return newcounts

    def load_valid_classes(self):
        counts = self.count_docs_per_class()
        counts = self.filter_classes(counts)
        self.valid_classes = list(counts)

    def load_documents(self):
        for file in ProgressIterator(reuters.fileids(), doc_progress_label):
            categories = reuters.categories(file)
            if len(categories) > 1:
                continue
            classname = categories[0]
            if classname not in self.valid_classes:
                continue
            class_id = self.classinfo.increase_class_count(classname)

            text = " ".join(reuters.words(file))
            text = re.sub("(\d+) \. (\d+)", r"\1.\2", text)
            text = re.sub("(\d+) \, (\d+)", r"\1,\2", text)
            text = re.sub(" \.", ".", text)
            text = re.sub(" \.", ".", text)
            text = re.sub(" \,", ",", text)
            text = re.sub(" \)", ")", text)
            text = re.sub("\( ", "(", text)
            text = re.sub(" \\' ", "'", text)

            self.docinfo.add_document(text, class_id)

    def run(self):
        # Set the NLTK path (http://www.nltk.org/_modules/nltk/data.html)
        nltk_path.append(join(config.paths["rawdata"], "nltk"))

        try:
            # Check which classes are valid depending on min_docs_per_class
            nbprint('Loading classes')
            self.load_valid_classes()

            # Load the documents
            with data.document_writer(self.info) as document_writer:
                # Initialize info classes
                self.classinfo = ClassInfo()
                self.docinfo = DocumentInfo(document_writer)

                # Load documents and store class information in classinfo
                self.load_documents()

            # Print Meta Information
            self.docinfo.save_meta(self.info)
            self.classinfo.save_meta(self.info)

        except (LookupError, FileNotFoundError):
            raise ImporterError(
                self.info,
                'Directory "{}" does not contain the required corpus.'.format(
                    nltk_path))

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)
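The Reuters corpus ships with NLTK; once downloaded (e.g. via nltk.download('reuters')) the calls used above work directly. A minimal sketch including one of the detokenizing substitutions:

import re
from nltk.corpus import reuters

fileid = reuters.fileids()[0]
print(reuters.categories(fileid))  # e.g. ['trade']

# reuters.words() yields tokens; re-join and undo some tokenization,
# e.g. "1 . 5" -> "1.5", as load_documents does above.
text = ' '.join(reuters.words(fileid))
text = re.sub(r'(\d+) \. (\d+)', r'\1.\2', text)
print(text[:80])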