예제 #1
0
def rename_groups(databases=DBS):
    """Renames headphones in the measurement databases according to "name_groups.tsv".

    Every line of "name_groups.tsv" (read from DIR_PATH) is a tab separated group of names where
    the first column is the true name and all the other columns are old names which are replaced
    with it. Name index entries and existing CSV files are renamed in every database. If the true
    name is "ignore", the entries are marked with form "ignore" and the data directories of the
    old names are removed instead. The name indexes are written back to disk at the end.

    Args:
        databases: Names of the database directories under DIR_PATH

    Returns:
        None
    """
    with open(os.path.join(DIR_PATH, 'name_groups.tsv'), 'r',
              encoding='utf-8') as fh:
        lines = fh.read().strip().split('\n')

    # First column is always the true name
    # Create dict with each old name as key and its true name as value
    name_map = dict()
    for line in lines:
        true_name, *old_names = line.split('\t')
        for old in old_names:
            name_map[old] = true_name

    # Read name indexes and existing files for all supported measurement databases
    dbs = []
    for db in databases:
        index_path = os.path.join(DIR_PATH, db, 'name_index.tsv')
        if os.path.isfile(index_path):
            # Read name index
            name_index = NameIndex.read_tsv(index_path)
        else:
            # No name index, create one anew
            name_index = NameIndex()
        # Read all the existing files for the database
        csv_paths = glob(os.path.join(DIR_PATH, db, 'data', '**', '*.csv'),
                         recursive=True)
        files = [{
            'name': os.path.split(file)[1].replace('.csv', ''),
            'path': file
        } for file in csv_paths]
        # Save both to dbs
        dbs.append({'name': db, 'name_index': name_index, 'files': files})

    for old_name, new_name in name_map.items():
        print(f'"{old_name}" -> "{new_name}"')
        for db in dbs:
            name_index = db['name_index']
            # Replace true names in name index with the new name
            updated_item = False
            matches = name_index.find(true_name=old_name)
            for item in matches.items:
                if new_name == 'ignore':
                    name_index.update(NameItem(false_name=item.false_name,
                                               true_name=item.true_name,
                                               form='ignore'),
                                      true_name=old_name)
                    print(
                        f'    Updated item: "{item.false_name}", "{new_name}", "ignore"'
                    )
                else:
                    name_index.update(NameItem(false_name=item.false_name,
                                               true_name=new_name,
                                               form=item.form),
                                      true_name=old_name)
                    print(
                        f'    Updated item: "{item.false_name}", "{new_name}", "{item.form}"'
                    )
                updated_item = True

            # Rename existing files, file names are compared case-insensitively
            renamed = [
                f for f in db['files']
                if f['name'].lower() == old_name.lower()
            ]
            for file in renamed:
                name, path = file['name'], file['path']
                old_dir = os.path.dirname(path)
                if new_name == 'ignore':
                    print(f'    Removing "{old_dir}"')
                    shutil.rmtree(old_dir)
                    if not updated_item:
                        name_index.add(
                            NameItem(false_name=old_name,
                                     true_name=None,
                                     form='ignore'))
                        print(f'    Added item: "{old_name}", "", "ignore"')
                    continue

                # Literal replacement: the new name must not be interpreted as a regex
                # replacement template (backslashes would be treated as escapes)
                new_path = path.replace(name, new_name)
                print(
                    f'    Moving "{os.path.relpath(path, DIR_PATH)}" to "{os.path.relpath(new_path, DIR_PATH)}"'
                )
                os.makedirs(os.path.split(new_path)[0], exist_ok=True)
                shutil.move(path, new_path)
                # Remove the old directory. The directory is addressed directly because a path
                # through the moved file ("<file>/..") can't be resolved on POSIX systems.
                os.rmdir(old_dir)
                matches = name_index.find(true_name=new_name)
                if not matches:
                    form = _form_from_path(path)
                    if form is None:
                        print(f'    Could not infer form from "{path}"')
                    name_index.add(
                        NameItem(false_name=old_name,
                                 true_name=new_name,
                                 form=form))
                    print(
                        f'    Added item: "{old_name}", "{new_name}", "{form}"'
                    )
        print()

    for db in dbs:
        db['name_index'].write_tsv(
            os.path.join(DIR_PATH, db['name'], 'name_index.tsv'))


def _form_from_path(path):
    """Finds the innermost path component which is a headphone form directory.

    Args:
        path: File path to inspect

    Returns:
        "onear", "inear" or "earbud", None if the path has no such component
    """
    head = path
    while True:
        parent, tail = os.path.split(head)
        if tail in ('onear', 'inear', 'earbud'):
            return tail
        if parent == head:
            # Reached the root or an empty path, splitting further would loop forever
            return None
        head = parent
예제 #2
0
class CrinacleCrawler(Crawler):
    def __init__(self, driver=None):
        """Initializes the crawler.

        Args:
            driver: Passed as is to the base class initializer
        """
        # Name index built from the downloaded phone books, set by get_name_proposals()
        self.book_name_index = None
        # NOTE(review): the attribute is assigned before the base initializer runs, presumably
        # because the base class may call get_name_proposals() while initializing -- confirm
        super().__init__(driver=driver)

    def get_name_proposals(self):
        """Downloads and parses the phone books to get name proposals.

        Entries of the three Crinacle phone books are added to the name proposals of the base
        class as rows of false name, true name and form. A separate name index with only the
        phone book entries is stored in "self.book_name_index".

        Returns:
            NameIndex

        Raises:
            requests.HTTPError: A phone book request returned an error status
        """
        names = super().get_name_proposals()

        # Phone book URL and the form given to all of its entries
        phone_books = [
            # Ears-711 measurements name index
            ('https://crinacle.com/graphing/data_hp/phone_book.json', 'onear'),
            # IEM measurements name index
            ('https://crinacle.com/graphing/data/phone_book.json', 'inear'),
            # Gras measurements name index
            ('https://crinacle.com/graphing/data_hp_gras/phone_book.json',
             'onear'),
        ]

        rows = []
        for url, form in phone_books:
            # Timeout prevents the crawler from hanging forever on a stalled connection
            res = requests.get(url, timeout=60)
            # Fail with the HTTP error instead of trying to parse an error page as JSON
            res.raise_for_status()
            book = self.parse_book(res.json())
            for false_name, true_name in book.items():
                rows.append([false_name, true_name, form])

        self.book_name_index = NameIndex(rows)

        names.concat(NameIndex(rows))
        names.remove_duplicates()
        return names

    @staticmethod
    def parse_book(data):
        """Parses a phone book as dict with false names as keys and true names as values.

        Args:
            data: Phone book object

        Returns:
            Dict with false names and true names
        """
        book = dict()
        for manufacturer in data:
            manufacturer_name = manufacturer['name']
            if 'suffix' in manufacturer:
                manufacturer_name += f' {manufacturer["suffix"]}'
            for model in manufacturer['phones']:
                if type(model) == str:
                    # Plain string
                    book[model.strip()] = f'{manufacturer_name} {model}'.strip(
                    )

                else:
                    # Object
                    if type(model['file']) == str:
                        # Single file as string, wrap in list
                        model['file'] = [model['file']]

                    if 'suffix' in model:
                        for f, suffix in zip(model['file'], model['suffix']):
                            book[f.strip(
                            )] = f'{manufacturer_name} {model["name"]} {suffix}'.strip(
                            )
                    else:
                        for f in model['file']:
                            book[f.strip(
                            )] = f'{manufacturer_name} {model["name"]}'.strip(
                            )

        return book

    @staticmethod
    def read_name_index():
        """Reads the "name_index.tsv" file located in DIR_PATH.

        Returns:
            Result of NameIndex.read_tsv for the file
        """
        index_path = os.path.join(DIR_PATH, 'name_index.tsv')
        return NameIndex.read_tsv(index_path)

    def write_name_index(self):
        """Writes "self.name_index" to the "name_index.tsv" file located in DIR_PATH."""
        index_path = os.path.join(DIR_PATH, 'name_index.tsv')
        self.name_index.write_tsv(index_path)

    @staticmethod
    def get_existing():
        """Reads the existing measurements with NameIndex.read_files.

        The pattern given to NameIndex.read_files matches CSV files in the sub-directories of
        the "data" directory located in DIR_PATH.

        Returns:
            Result of NameIndex.read_files for the pattern
        """
        csv_pattern = os.path.join(DIR_PATH, 'data', '**', '*.csv')
        return NameIndex.read_files(csv_pattern)

    def get_urls(self):
        """Finds the raw measurement files grouped by headphone name and measurement rig.

        The files are read from the sub-directories of "raw_data" in DIR_PATH. Channel suffixes
        (such as " L.txt" or " R2.txt"), the ".txt" extension and a trailing sample number
        (such as " #2") are removed from the file name to get the headphone name.

        Returns:
            Dict with headphone names as keys. Values are dicts with rig names ("iem", "legacy"
            or "gras") as keys and lists of file paths as values.
        """
        # Link source is not a web page but raw_data folder
        file_paths = dict()

        def add_to(_fp, _rig):
            # Uses the given path, not a loop variable of the enclosing scope
            name = os.path.split(_fp)[1]
            name = re.sub(r' [LR]\d*\.txt', '', name).replace('.txt', '')
            name = re.sub(r' #\d$', '', name)
            file_paths.setdefault(name, dict()).setdefault(_rig,
                                                           []).append(_fp)

        patreon_dir = os.path.join(DIR_PATH, 'raw_data')

        # Source directory and rig name pairs: IEMs, Ears + 711 and Gras
        sources = [
            ('IEM Measurements (TSV)', 'iem'),
            ('Legacy Data (EARS + 711)', 'legacy'),
            ('FR Data (CSV)', 'gras'),
        ]
        for dir_name, rig in sources:
            for fp in glob(os.path.join(patreon_dir, dir_name, '*.txt')):
                add_to(fp, rig)

        for rigs_and_file_paths in file_paths.values():
            if ('iem' in rigs_and_file_paths
                    and ('legacy' in rigs_and_file_paths
                         or 'gras' in rigs_and_file_paths)):
                # Remove IEM rig measurements if Ears-711 or GRAS measurements exist
                # This means the headphone is onear model and the files found in IEM folder are duplicates
                del rigs_and_file_paths['iem']

        return file_paths

    def process(self, item, file_paths, target_dir=None):
        """Averages the measurements in the given files and saves the result as a CSV file.

        Each file is parsed as rows of frequency and level (an optional third column is ignored)
        separated by ", ", tab or space. Empty lines, lines starting with "*", rows with "?" as
        a value and rows which can't be converted to numbers are skipped. Every measurement is
        interpolated and centered before averaging. The result is written to
        "<target_dir>/<true name>/<true name>.csv".

        Args:
            item: NameItem, true name is used as the name of the result
            file_paths: Paths to the raw measurement files
            target_dir: Directory where the result directory is created

        Returns:
            None

        Raises:
            TypeError: "target_dir" was not given
            ValueError: "file_paths" is empty
        """
        if target_dir is None:
            raise TypeError('"target_dir" must be given')
        if not file_paths:
            # Averaging zero measurements would divide by zero and save invalid data
            raise ValueError('"file_paths" must not be empty')
        avg_fr = FrequencyResponse(name=item.true_name)
        avg_fr.raw = np.zeros(avg_fr.frequency.shape)
        for fp in file_paths:
            with open(fp, 'r', encoding='utf-8') as fh:
                s = fh.read()

            freq = []
            raw = []
            for line in s.split('\n'):
                if len(line) == 0 or line[0] == '*':
                    # Skip empty lines and comments
                    if 'C-weighting compensation: On' in line:
                        print(f'C-weighted measurement: {item.false_name}')
                    continue

                # Try the delimiters in order until the line splits into several columns
                for delimiter in (', ', '\t', ' '):
                    frp = line.split(delimiter)
                    if len(frp) > 1:
                        break
                if len(frp) not in (2, 3):
                    # Must be comment line
                    continue
                f, r = frp[0], frp[1]

                if f == '?' or r == '?':
                    # Skip lines with missing data
                    continue

                try:
                    # Convert both values before storing either one so that a row with only
                    # one valid number can't leave the lists with different lengths
                    f, r = float(f), float(r)
                except ValueError:
                    # Failed to convert values to floats, must be header or comment row, skip
                    continue
                freq.append(f)
                raw.append(r)

            # Create standard fr object
            fr = FrequencyResponse(name=item.true_name,
                                   frequency=freq,
                                   raw=raw)
            fr.interpolate()
            fr.center()
            avg_fr.raw += fr.raw

        avg_fr.raw /= len(file_paths)

        # Save
        dir_path = os.path.join(target_dir, avg_fr.name)
        os.makedirs(dir_path, exist_ok=True)
        file_path = os.path.join(dir_path, f'{avg_fr.name}.csv')
        avg_fr.write_to_csv(file_path)
        print(f'Saved "{avg_fr.name}" to "{file_path}"')

    def prompt(self, false_name, form=None):
        """Prompts user for true name and form based on false name."""
        if self.name_proposals is not None:
            intermediate_name = self.book_name_index.find(
                false_name=false_name)
            if len(intermediate_name) == 0:
                intermediate_name = false_name
            else:
                intermediate_name = intermediate_name.items[0].true_name
            # Name proposals initialized, add matching entries to options in prompt
            matches = []
            matches += self.name_proposals.search_by_false_name(
                intermediate_name)
            matches += self.name_proposals.search_by_true_name(
                intermediate_name)
            names_and_ratios = []
            for match in matches:
                if not match[0].true_name:
                    # Skip items without true name
                    continue
                if form is not None and form != match[0].form:
                    # Skip items which don't match the given form
                    continue
                if match[1] == 100:
                    # Exact match
                    match[0].true_name += ' ✓'
                if match[0].true_name.replace(' ✓', '') not in [
                        x[0].replace(' ✓', '') for x in names_and_ratios
                ]:
                    # New match
                    names_and_ratios.append(
                        (match[0].true_name, match[1], match[0].form))
                else:
                    # Existing match, update ratio
                    for i in range(len(names_and_ratios)):
                        if match[0].true_name.replace(
                                ' ✓', '') == names_and_ratios[i][0].replace(
                                    ' ✓', ''):
                            if match[1] > names_and_ratios[i][1]:
                                names_and_ratios[i] = (match[0].true_name,
                                                       match[1],
                                                       names_and_ratios[i][2])

            name_options = [
                x[0] for x in sorted(
                    names_and_ratios, key=lambda x: x[1], reverse=True)[:4]
            ]
            if intermediate_name not in [
                    s.replace(' ✓', '') for s in name_options
            ]:
                name_options.append(intermediate_name)  # Add the false name

            # Prompt
            true_name = self.prompt_true_name(name_options)

            if true_name is None:
                return None

            # Find and replace true manufacturer name or prompt it
            if self.manufacturers.find(true_name)[0] is None:
                # Unknown manufacturer, find options with the two first words and prompt it
                manufacturer_options = []
                for i in range(1, min(3, len(true_name.split()))):
                    candidate = ' '.join(true_name.split()[:i])
                    print(candidate)
                    manufacturer_options += self.manufacturers.search(
                        candidate)
                    if candidate not in [x[0] for x in manufacturer_options]:
                        manufacturer_options.append((candidate, 0))
                manufacturer_options = sorted(manufacturer_options,
                                              key=lambda x: x[1],
                                              reverse=True)
                manufacturer_options = [x[0] for x in manufacturer_options]
                manufacturer, replace = self.prompt_manufacturer(
                    manufacturer_options)
                _, match = self.manufacturers.find(manufacturer)
                if match:
                    # Add as a new variant in existing manufacturer
                    for m in self.manufacturers.manufacturers:
                        if m[0] == match:
                            m.append(replace)
                else:
                    # Add new manufacturer
                    self.manufacturers.manufacturers.append([manufacturer])
                self.manufacturers.write()
            # Replace
            true_name = self.manufacturers.replace(true_name)

            # Find the answer and select form
            for name, ratio, f in names_and_ratios:
                if true_name == name:
                    form = f
                    break
            true_name = true_name.replace(' ✓', '')

        else:
            true_name = self.prompt_true_name([false_name])

        if true_name is None:
            # User skipped
            return None

        if form is None:
            # Form not found in name proposals, prompt it
            form = self.prompt_form()

        return NameItem(false_name, true_name, form)

    def process_new(self, prompt=True):
        """Processes all new measurements

        Updates name index with the new entries not found in the name index previously. Items
        with form "ignore" are skipped and known items are processed only if no measurement
        exists for them yet. The name index is written to disk after each unknown item.

        Args:
            prompt: If True, true name and form of an unknown item are prompted from the user.
                If False, the item is added to the name index without true name and form.

        Returns:
            None
        """
        for false_name, rigs_and_file_paths in self.urls.items():
            for rig, file_paths in rigs_and_file_paths.items():
                try:
                    ni = self.name_index.find(false_name=false_name)
                    item = ni.items[0] if ni else None

                    if item and item.form == 'ignore':
                        continue

                    # TODO: Infer form from the file path
                    file_paths = [os.path.abspath(p) for p in file_paths]
                    if rig == 'gras':
                        form = 'onear'
                        target_dir = os.path.join(DIR_PATH, 'data', 'onear',
                                                  'GRAS 43AG-7')
                    elif rig == 'legacy':
                        form = 'onear'
                        target_dir = os.path.join(DIR_PATH, 'data', 'onear',
                                                  'Ears-711')
                    else:
                        form = None
                        target_dir = os.path.join(DIR_PATH, 'data', 'inear')

                    if item and item.true_name:
                        # Name index contains the entry
                        if not self.existing.find(true_name=item.true_name):
                            # Doesn't exist yet
                            if form is not None:
                                item.form = form
                            self.process(item,
                                         file_paths,
                                         target_dir=target_dir)

                    else:
                        # Unknown item
                        if prompt:
                            # Prompt true name and form
                            print(f'\n"{false_name}" is not known.')
                            item = self.prompt(false_name, form=form)
                            if item is None:
                                # User skipped, mark the item as ignored
                                self.name_index.update(NameItem(
                                    false_name, None, 'ignore'),
                                                       false_name=false_name)
                                # Save the decision right away like for the other unknown
                                # items, otherwise it is lost if nothing is written later
                                self.write_name_index()
                                continue
                            self.name_index.update(item, false_name=false_name)
                            self.process(item,
                                         file_paths,
                                         target_dir=target_dir)
                        else:
                            print(
                                f'"{false_name}" is not known. Add true name and form to name index and run again.'
                            )
                            self.name_index.update(NameItem(
                                false_name, None, None),
                                                   false_name=false_name)
                        self.write_name_index()
                except Exception:
                    # Tell which item failed and let the error propagate unchanged
                    print(f'Processing failed for "{false_name}"')
                    raise