示例#1
0
    def hydrate_tags_table(self, col, lookup_data, lookup_ids=False):
        self.cursor.execute(
            'SELECT DISTINCT original_tag from tags WHERE original_column="{0}"'
            .format(col))
        results = self.cursor.fetchall()
        total = len(results)
        cur = 0
        print "{0} tags for column '{1}'".format(total, col)

        for tag_row in results:
            cur = Common.print_progress(cur, total)

            # Get tag data
            parent = ''
            extra_column = ', {0} as description'.format(
                lookup_data['extra_column']
            ) if 'extra_column' in lookup_data else ''
            lookup_column = ', {0} as parent'.format(
                lookup_data['lookup_field']
            ) if 'lookup_field' in lookup_data else ''
            matching_field = lookup_data[
                'id_name'] if lookup_ids else lookup_data['field_name']

            dict_cursor = self.db.cursor(MySQLdb.cursors.DictCursor)
            dict_cursor.execute("""
          SELECT {0}, {1}{2}{3} FROM {4} WHERE {5}='{6}'
        """.format(lookup_data['id_name'], lookup_data['field_name'],
                   lookup_column, extra_column, lookup_data['table_name'],
                   matching_field, tag_row[0]))
            tag = dict_cursor.fetchone()
            if tag is not None:
                # Get parent data
                if 'lookup_field' in lookup_data:
                    self.cursor.execute(
                        "SELECT {0} FROM {1} WHERE {2}='{3}'".format(
                            lookup_data['lookup_table_field'],
                            lookup_data['lookup_table'],
                            lookup_data['lookup_id'], tag['parent']))
                    result = self.cursor.fetchone()
                    parent = result[0] if result is not None else ''

                # Update the table
                description = tag['description'] if 'description' in tag else ''
                self.hydrate_tag_row(tag_id=tag[lookup_data['id_name']],
                                     tag_to_look_up=tag_row[0],
                                     tag=tag[lookup_data['field_name']],
                                     table=lookup_data['table_name'],
                                     col=col,
                                     parent=parent,
                                     description=description)
示例#2
0
def _create_mysql(args, FILES, log):
    """(Re)create the temporary import database and load authors, stories
    and bookmarks from the parsed archive metadata.

    :param args: parsed command-line args (db credentials, temp db name,
        default fandom, chapter file extensions).
    :param FILES: dict mapping original id -> metadata dict per archive file.
    :param log: logger used to report the offending record on failure.
    """
    db = connect(args.db_host, args.db_user, args.db_password, "")
    cursor = db.cursor()
    DATABASE_NAME = args.temp_db_database

    # Use the database and empty all the tables
    cursor.execute(u"drop database if exists {0};".format(DATABASE_NAME))
    cursor.execute(u"create database {0};".format(DATABASE_NAME))
    cursor.execute(u"use {0}".format(DATABASE_NAME))

    sql = Sql(args)
    sql.run_script_from_file('shared_python/create-open-doors-tables.sql',
                             DATABASE_NAME)
    db.commit()

    authors = [(FILES[i].get('Author', '').strip(),
                FILES[i].get('Email', FILES[i].get('EmailAuthor',
                                                   '')).lower().strip())
               for i in FILES]
    auth = u"INSERT INTO authors (name, email) VALUES(%s, %s);"
    cursor.executemany(auth, set(authors))
    db.commit()

    # Authors
    auth = u"SELECT * FROM authors;"
    cursor.execute(auth)
    db_authors = cursor.fetchall()

    # Stories and bookmarks.  Values are kept RAW here; quoting is handled
    # by parameter binding at execute time (the old manual backslash
    # escaping was injection-prone and needed an unescape round-trip).
    stories = [(
        i,
        FILES[i].get('Title', ''),
        FILES[i].get('Summary', ''),
        _extract_tags(args, FILES[i]),
        _extract_characters(args, FILES[i]),
        # First of PrintTime / DatePrint / Date, defaulting to today.
        datetime.datetime.strptime(
            FILES[i].get(
                'PrintTime', FILES[i].get(
                    'DatePrint', FILES[i].get(
                        'Date',
                        str(datetime.datetime.now().strftime('%m/%d/%y'))))),
            '%m/%d/%y').strftime('%Y-%m-%d'),
        FILES[i].get('Location', ''),
        FILES[i].get('LocationURL', FILES[i].get('StoryURL', '')),
        FILES[i].get('Notes', ''),
        _extract_relationships(args, FILES[i]),
        FILES[i].get('Rating', ''),
        FILES[i].get('Warnings', ''),
        FILES[i].get('Author', '').strip(),
        FILES[i].get('Email', FILES[i].get('EmailAuthor', '')).lower().strip(),
        FILES[i].get('FileType', args.chapters_file_extensions)
        if not _is_external(FILES[i]) else 'bookmark',
        _extract_fandoms(args, FILES[i]),
    ) for i in FILES]

    cur = 0
    total = len(FILES)
    for (original_id, title, summary, tags, characters, date, location, url,
         notes, pairings, rating, warnings, author, email, filetype,
         fandoms) in set(stories):

        cur = Common.print_progress(cur, total)
        # Pre-bind so the error log below can never hit an unbound name
        # when the author lookup fails.
        authorid = None
        try:
            # For AA archives with external links:
            if filetype != 'bookmark':
                filename = url if location == '' else location + '.' + filetype
                table_name = 'stories'
            else:
                filename = url
                table_name = 'bookmarks'

            # Prepend the default fandom if one is configured.
            final_fandoms = fandoms
            if args.default_fandom is not None:
                if final_fandoms == '' or final_fandoms == args.default_fandom:
                    final_fandoms = args.default_fandom
                else:
                    final_fandoms = args.default_fandom + ', ' + final_fandoms

            result = [
                element for element in db_authors
                if element[1] == author and element[2] == email
            ]
            authorid = result[0][0]

            # Only the table name is interpolated, and it is one of two
            # hard-coded strings; every value is a bound parameter.
            stor = (u"INSERT INTO {0} (id, fandoms, title, summary, tags, "
                    u"characters, date, url, notes, relationships, rating, "
                    u"warnings, author_id) "
                    u"VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
                    ).format(table_name)
            cursor.execute(stor, (original_id, final_fandoms, title, summary,
                                  tags, characters, date, filename, notes,
                                  pairings, rating, warnings, authorid))
        except Exception:
            # Log the full record so the bad row can be identified, then
            # let the original exception propagate.
            log.error("table name: {0}\noriginal id: {1}\nfinal fandoms: '{2}'\ntitle: '{3}'\nsummary: '{4}'\ntags: '{5}'" \
                  "\ncharacters: '{6}'\ndate: '{7}'\nfilename: '{8}'\nnotes: '{9}'\npairings: '{10}'\nrating: '{11}'" \
                  "\nwarnings: '{12}'\nauthor id: '{13}'"\
              .format(table_name,
                  original_id,
                  final_fandoms,
                  title,
                  summary,
                  tags,
                  characters,
                  date,
                  filename,
                  notes,
                  pairings,
                  rating,
                  warnings,
                  authorid))
            raise
    db.commit()
示例#3
0
    def _gather_and_dedupe(self, chapters_path, extensions, has_ids=False):
        """Walk *chapters_path* and map chapter ids (or bare file names) to
        file paths, interactively resolving duplicate chapter ids.

        :param chapters_path: root folder containing chapter files.
        :param extensions: comma-separated list of chapter file extensions.
        :param has_ids: True when chapter file names are chapter ids.
        :return: dict mapping chapter id/name -> file path.
        """
        self.log.info("\nFinding chapters and identifying duplicates")
        extensions = re.split(r", ?", extensions)
        story_folder = os.walk(chapters_path)
        file_paths = {}
        duplicate_chapters = {}
        has_duplicates = False
        messages = []
        sql_messages = []
        cur = 0

        for root, _, filenames in story_folder:
            total = len(filenames)
            # Keep the running counter (the return value was discarded
            # before, unlike every sibling call site).
            cur = Common.print_progress(cur, total)

            for filename in filenames:
                if has_ids and self._ends_with(filename, extensions):
                    file_path = os.path.join(root, filename)
                    cid = os.path.splitext(filename)[0]
                    if cid not in file_paths:
                        file_paths[cid] = file_path
                    else:
                        duplicate_folder = os.path.split(
                            os.path.split(file_path)[0])[1]
                        messages.append(file_path + " is a duplicate of " +
                                        file_paths[cid])
                        # BUG FIX: format index was {1} with a single
                        # argument, which raised IndexError at runtime.
                        sql_messages.append(
                            "SELECT * FROM chapters WHERE id = {0}".format(
                                cid))
                        duplicate_chapters[cid] = [{
                            'folder_name':
                            os.path.split(os.path.split(
                                file_paths[cid])[0])[1],
                            'filename':
                            filename,
                            'path':
                            file_paths[cid]
                        }, {
                            'folder_name': duplicate_folder,
                            'filename': filename,
                            'path': file_path
                        }]
                        has_duplicates = True
                else:
                    file_path = os.path.join(root, filename)
                    name = os.path.splitext(filename)[0]
                    file_paths[name] = file_path

        if has_duplicates:
            self.log.warn('\n'.join(messages + sql_messages))
            self.log.warn(duplicate_chapters)
            folder_name_type = raw_input(
                "Resolving duplicates: pick the type of the folder name under {0} "
                "\n1 = author id\n2 = author name\n3 = skip duplicates check\n"
                .format(chapters_path))
            if folder_name_type == '1':
                for cid, duplicate in duplicate_chapters.items():
                    # look up the author id and add that one to the file_names list
                    # BUG FIX: format index was {1} with a single argument,
                    # which raised IndexError at runtime.
                    self.cursor.execute(
                        "SELECT author_id FROM chapters WHERE id = {0}".format(
                            cid))
                    sql_author_id = self.cursor.fetchall()
                    if len(sql_author_id) > 0:
                        author_id = sql_author_id[0][0]
                        # Keep the copy whose folder name matches the author id.
                        file_paths[cid] = [
                            dc['path'] for dc in duplicate_chapters[cid]
                            if dc['folder_name'] == str(author_id)
                        ][0]
            elif folder_name_type == '2':
                self.log.warn("Not implemented")

        return file_paths
示例#4
0
    def populate_chapters(self, folder=None, extensions=None):
        """Read every chapter file and store its text in the chapters table.

        :param folder: chapter folder; defaults to self.args.chapters_path.
        :param extensions: comma-separated extensions; defaults to
            self.args.chapters_file_extensions.
        """
        if folder is None:
            folder = self.args.chapters_path
        if extensions is None:
            extensions = self.args.chapters_file_extensions

        self.log.info("Processing chapters...")

        filenames_are_ids = raw_input(
            "\nChapter file names are chapter ids? Y/N\n")
        has_ids = filenames_are_ids.lower() == 'y'
        file_paths = self._gather_and_dedupe(folder, extensions, has_ids)

        char_encoding = raw_input(
            "\n\nImporting chapters: pick character encoding (check for curly quotes):\n"
            "1 = Windows 1252\nenter = UTF-8\n")
        char_encoding = 'cp1252' if char_encoding == '1' else 'utf8'

        cur = 0
        total = len(file_paths)

        # One UPDATE shape per mode; values are always bound as parameters.
        if has_ids:
            query = "UPDATE {0}.chapters SET text=%s WHERE id=%s".format(
                self.args.output_database)
        else:
            query = ("UPDATE {0}.chapters SET text=%s WHERE url=%s and text=''"
                     .format(self.args.output_database))

        for key, chapter_path in file_paths.items():
            cur = Common.print_progress(cur, total)
            label = key
            try:
                if has_ids:
                    # File name (the dict key) is the chapter id.
                    identifier = int(key)
                else:
                    # Path relative to the chapters root, minus the
                    # leading separator, matches the stored url.
                    identifier = chapter_path.replace(
                        self.args.chapters_path, '')[1:]
                    label = identifier
                with codecs.open(chapter_path, 'r',
                                 encoding=char_encoding) as c:
                    file_contents = c.read()
                self.cursor.execute(query, (file_contents, identifier))
                self.db.commit()
            except Exception as e:
                # Best-effort import: log the failing chapter and continue.
                self.log.error(
                    "Error = chapter id: {0} - chapter: {1}\n{2}".format(
                        label, chapter_path, str(e)))

        self.db.close()
示例#5
0
    def _gather_and_dedupe(self, chapters_path, extensions):
        extensions = re.split(r", ?", extensions)
        story_folder = os.walk(chapters_path)
        file_paths = {}
        duplicate_chapters = {}
        error = False
        messages = []
        sql_messages = []
        cur = 0
        for root, _, filenames in story_folder:
            total = len(filenames)
            Common.print_progress(cur, total)
            for filename in filenames:

                if self._ends_with(filename, extensions):
                    file_path = os.path.join(root, filename)
                    cid = os.path.splitext(filename)[0]
                    if cid not in file_paths.keys():
                        file_paths[cid] = file_path
                    else:
                        duplicate_folder = os.path.split(
                            os.path.split(file_path)[0])[1]
                        messages.append(file_path + " is a duplicate of " +
                                        file_paths[cid])
                        sql_messages.append(
                            "SELECT * FROM {0}_chapters WHERE id = {1}".format(
                                self.args.db_table_prefix, cid))
                        duplicate_chapters[cid] = [{
                            'folder_name':
                            os.path.split(os.path.split(
                                file_paths[cid])[0])[1],
                            'filename':
                            filename,
                            'path':
                            file_paths[cid]
                        }, {
                            'folder_name': duplicate_folder,
                            'filename': filename,
                            'path': file_path
                        }]
                        error = True

        if error:
            print '\n'.join(messages + sql_messages)
            print duplicate_chapters
            folder_name_type = raw_input(
                "Resolving duplicates: pick the type of the folder name under {0} \n1 = author id\n2 = author name\n"
                .format(chapters_path))
            if folder_name_type == '1':
                for cid, duplicate in duplicate_chapters.items():
                    # look up the author id and add that one to the file_names list
                    self.cursor.execute(
                        "SELECT authorid FROM {0}_chapters WHERE id = {1}".
                        format(self.args.db_table_prefix, cid))
                    sql_author_id = self.cursor.fetchall()
                    if len(sql_author_id) > 0:
                        author_id = sql_author_id[0][0]
                        file_paths[cid] = [
                            dc['path'] for dc in duplicate_chapters[cid]
                            if dc['folder_name'] == str(author_id)
                        ][0]

        return file_paths