def write_hesuvi_zip():
    """Writes ``hesuvi.zip`` with one GraphicEQ text file per headphone model.

    Collects every ``* GraphicEQ.txt`` file found recursively under the listed
    result directories, skips names matching ``MOD_REGEX``, names whose
    manufacturer cannot be resolved and archive names that were already
    written, shifts the gains so that their mean between 100 Hz and 10 kHz is
    zero and stores the result as ``eq/<manufacturer>/<model>.txt``.

    Directories earlier in the list take precedence when the same archive name
    is produced by several sources.

    Returns:
        None
    """
    manufacturers = ManufacturerIndex()
    dir_paths = [
        os.path.join(DIR_PATH, 'oratory1990'),
        os.path.join(DIR_PATH, 'crinacle', 'gras_43ag-7_harman_over-ear_2018'),
        os.path.join(DIR_PATH, 'crinacle', 'ears-711_harman_over-ear_2018'),
        os.path.join(DIR_PATH, 'innerfidelity'),
        os.path.join(DIR_PATH, 'rtings'),
        os.path.join(DIR_PATH, 'headphonecom'),
        os.path.join(DIR_PATH, 'referenceaudioanalyzer'),
    ]
    zip_files = set()
    # The context manager guarantees the archive is finalised and the file
    # handle released even if parsing one of the inputs raises.
    with ZipFile(os.path.join(DIR_PATH, 'hesuvi.zip'), 'w') as zip_object:
        for dir_path in dir_paths:
            pattern = os.path.join(dir_path, '**', '* GraphicEQ.txt')
            for fp in glob(pattern, recursive=True):
                _, name = os.path.split(fp)
                name = name.replace(' GraphicEQ.txt', '')
                if re.search(MOD_REGEX, name, flags=re.IGNORECASE):
                    # Skip samples, there are averaged results available
                    continue
                manufacturer, _ = manufacturers.find(name)
                if manufacturer is None:
                    print(f'Manufacturer could not be found for {name}')
                    continue
                name = manufacturers.model(name)
                arcname = f'eq/{manufacturer}/{name}.txt'
                if arcname in zip_files:
                    # Skip duplicates
                    continue
                with open(fp, 'r', encoding='utf-8') as fh:
                    s = fh.read()
                # "GraphicEQ: f1 g1; f2 g2; ..." -> array of [frequency, gain] rows
                data = np.array(
                    [x.split() for x in s.split(': ')[1].split('; ')],
                    dtype='float')
                # Centre the gains around the 100 Hz .. 10 kHz mean
                sl = np.logical_and(data[:, 0] > 100, data[:, 0] < 10000)
                data[:, 1] -= np.mean(data[sl, 1])
                s = 'GraphicEQ: '
                s += '; '.join([f'{x[0]:.0f} {x[1]:.1f}' for x in data])
                zip_object.writestr(arcname, s)
                zip_files.add(arcname)
def write_hesuvi_index():
    """Writes ``hesuvi.zip`` with one GraphicEQ text file per headphone model.

    Ensures the ``hesuvi`` directory exists, then collects every
    ``* GraphicEQ.txt`` file found recursively under the listed result
    directories. Sample/serial-number variants, names whose manufacturer
    cannot be resolved and archive names that were already written are
    skipped. Gains are shifted so that their mean between 100 Hz and 10 kHz is
    zero and the result is stored as ``eq/<manufacturer>/<model>.txt``.

    Returns:
        None
    """
    os.makedirs(os.path.join(DIR_PATH, 'hesuvi'), exist_ok=True)
    manufacturers = ManufacturerIndex()
    dir_paths = [
        os.path.join(DIR_PATH, 'oratory1990'),
        os.path.join(DIR_PATH, 'crinacle', 'harman_in-ear_2019v2'),
        os.path.join(DIR_PATH, 'crinacle', 'crinacl_over-ear'),
        os.path.join(DIR_PATH, 'innerfidelity'),
        os.path.join(DIR_PATH, 'rtings'),
        os.path.join(DIR_PATH, 'headphonecom'),
    ]
    zip_files = set()
    # The context manager guarantees the archive is finalised and the file
    # handle released even if parsing one of the inputs raises.
    with ZipFile(os.path.join(DIR_PATH, 'hesuvi.zip'), 'w') as zip_object:
        for dir_path in dir_paths:
            pattern = os.path.join(dir_path, '**', '* GraphicEQ.txt')
            for fp in glob(pattern, recursive=True):
                _, name = os.path.split(fp)
                name = name.replace(' GraphicEQ.txt', '')
                if re.search(r' \(?(sample |sn)[a-zA-Z0-9]+\)?$', name, flags=re.IGNORECASE):
                    # Skip samples, there are averaged results available
                    continue
                manufacturer, _ = manufacturers.find(name)
                if manufacturer is None:
                    # Without a manufacturer the entry would be filed under
                    # "eq/None/"; report and skip it instead.
                    print(f'Manufacturer could not be found for {name}')
                    continue
                name = manufacturers.model(name)
                arcname = f'eq/{manufacturer}/{name}.txt'
                if arcname in zip_files:
                    # Skip duplicates
                    continue
                with open(fp, 'r', encoding='utf-8') as fh:
                    s = fh.read()
                # "GraphicEQ: f1 g1; f2 g2; ..." -> array of [frequency, gain] rows
                data = np.array(
                    [x.split() for x in s.split(': ')[1].split('; ')],
                    dtype='float')
                # Centre the gains around the 100 Hz .. 10 kHz mean
                sl = np.logical_and(data[:, 0] > 100, data[:, 0] < 10000)
                data[:, 1] -= np.mean(data[sl, 1])
                s = 'GraphicEQ: '
                s += '; '.join([f'{x[0]:.0f} {x[1]:.1f}' for x in data])
                zip_object.writestr(arcname, s)
                zip_files.add(arcname)
class Crawler(ABC):
    """Abstract base class for measurement crawlers with a widget based UI.

    Subclasses supply the source specific parts (reading and writing the name
    index, listing existing measurements, crawling URLs and processing one
    measurement). This class adds name proposals, naming prompts and download
    helpers on top of them.
    """

    def __init__(self, driver=None):
        """Initializes indexes, name proposals, crawled URLs and UI containers.

        Args:
            driver: Browser driver used by ``get_beautiful_soup``
                (presumably a Selenium WebDriver - confirm against callers).
        """
        self.driver = driver
        self.name_index = self.read_name_index()
        self.manufacturers = ManufacturerIndex()
        self.name_proposals = None
        self.init_name_proposals()
        self.existing = self.get_existing()
        self.urls = self.get_urls()

        # UI
        self.prompts = widgets.VBox([])
        self.iframe = widgets.VBox([])
        self.widget = widgets.HBox([self.prompts, self.iframe])

    @staticmethod
    @abstractmethod
    def read_name_index():
        """Reads name index as Index

        Returns:
            NameIndex
        """
        pass

    def init_name_proposals(self):
        """Builds the name proposal table from all measurement databases.

        Reads the name indexes of every database, drops duplicates and stores
        a DataFrame with columns ``form``, ``manufacturer`` and ``model`` in
        ``self.name_proposals``. Items without a true name, items marked as
        ``ignore`` and items whose true name doesn't start with a known
        manufacturer are left out.

        Returns:
            None
        """
        name_proposals = NameIndex()
        for db in ['crinacle', 'oratory1990', 'rtings', 'referenceaudioanalyzer']:
            name_index = NameIndex.read_tsv(os.path.join(DIR_PATH, db, 'name_index.tsv'))
            name_proposals.concat(name_index)
        for db in ['innerfidelity', 'headphonecom']:
            name_index = NameIndex.read_files(os.path.join(DIR_PATH, db, 'data', '**', '*.csv'))
            name_proposals.concat(name_index)
        name_proposals.remove_duplicates()

        # Manufacturer names are literal text, escape them so that characters
        # such as "." or "+" are not interpreted as regex syntax
        alternatives = '|'.join(re.escape(m[0]) for m in self.manufacturers.manufacturers)
        manufacturer_pattern = re.compile(rf'^({alternatives})', flags=re.IGNORECASE)

        proposal_data = {'form': [], 'manufacturer': [], 'model': []}
        for item in name_proposals.items:
            if not item.true_name or item.form == 'ignore':
                continue
            match = manufacturer_pattern.search(item.true_name)
            if not match:
                continue
            proposal_data['form'].append(item.form)
            proposal_data['manufacturer'].append(match[0])
            # The match is anchored to the start, so cut only that prefix;
            # a later repetition of the manufacturer text stays in the model
            proposal_data['model'].append(item.true_name[match.end():].strip())
        self.name_proposals = pd.DataFrame(proposal_data)

    @abstractmethod
    def write_name_index(self):
        """Writes name index to a file

        Returns:
            Index
        """
        pass

    @staticmethod
    @abstractmethod
    def get_existing():
        """Reads existing files as Index

        Returns:
            Index
        """
        pass

    @abstractmethod
    def get_urls(self):
        """Crawls measurement URLs

        Returns:
            Dict where headphone names are keys and URLs are values
        """
        pass

    @abstractmethod
    def process(self, item, url):
        """Downloads a single URL and processes it

        Args:
            item: Item
            url: URL to measurement

        Returns:
            None
        """
        pass

    def update_name_index(self, item):
        """Adds the item to the name index and writes it if it is not there yet.

        Args:
            item: NameItem to store

        Returns:
            None
        """
        exact_match = self.name_index.find_one(
            false_name=item.false_name, true_name=item.true_name, form=item.form)
        if not exact_match:
            self.name_index.update(item, false_name=item.false_name)
            self.write_name_index()

    def prompt_callback(self, false_name, url):
        """Creates the callback which a name prompt invokes with the user's answer.

        Args:
            false_name: Name as it exists in the measurement source
            url: URL to measurement

        Returns:
            Function taking true name and form
        """
        def callback(true_name, form):
            if form == 'ignore':
                self.update_name_index(NameItem(false_name, None, form))
                return
            item = NameItem(false_name, true_name, form)
            try:
                self.process(item, url)
            except FileNotFoundError as err:
                # Processing failed, don't save the name so it is asked again
                print(err)
                return
            self.update_name_index(item)
        return callback

    def process_new(self, prompt=True):
        """Processes all new measurements

        Items found in the name index but missing from the existing
        measurements are processed right away. For items not found in the name
        index a name prompt widget is created, or the item is skipped when
        prompting is prohibited.

        Args:
            prompt: Create prompts for unknown names?

        Returns:
            None
        """
        prompts = []
        unknown_manufacturers = []
        for false_name, url in self.urls.items():
            item = self.name_index.find_one(false_name=false_name)
            if item and item.form == 'ignore':
                continue

            if item:
                existing = self.existing.find_one(true_name=item.true_name)
                if not existing:
                    # Name found in name index but the measurement doesn't exist
                    self.process(item, url)
                continue

            if not prompt:
                print(f'{false_name} is unknown and prompting is prohibited, skipping the item.')
                continue

            # Name doesn't exist in the name index
            intermediate_name = self.intermediate_name(false_name)
            manufacturer, manufacturer_match = self.manufacturers.find(intermediate_name)
            if manufacturer:
                model = re.sub(
                    re.escape(manufacturer_match), '', intermediate_name, flags=re.IGNORECASE
                ).strip()
                name_proposals = self.get_name_proposals(false_name)
                similar = self.get_name_proposals(
                    false_name, n=6, normalize_digits=True, threshold=0)
                similar_names = [proposal.true_name for proposal in similar.items]
            else:
                unknown_manufacturers.append(intermediate_name)
                model = intermediate_name
                name_proposals = None
                similar_names = None

            # Not sure about the name, ask user
            prompts.append(
                NamePrompt(
                    model,
                    self.prompt_callback(false_name, url),
                    manufacturer=manufacturer,
                    name_proposals=name_proposals,
                    search_callback=self.search,
                    false_name=false_name,
                    similar_names=similar_names,
                ).widget)

        if unknown_manufacturers:
            print('Headphones with unknown manufacturers\n ' + '\n '.join(unknown_manufacturers))
            print('Add them to manufacturers.tsv and run this cell again')
        self.prompts.children = prompts

    def search(self, name):
        """Opens a Google image search for the name in the web browser.

        Args:
            name: Search phrase

        Returns:
            None
        """
        quoted = urllib.parse.quote_plus(name)
        url = f'https://google.com/search?q={quoted}&tbm=isch'
        webbrowser.open(url)

    def get_name_proposals(self, false_name, n=4, normalize_digits=False, normalize_extras=False, threshold=60):
        """Finds known names of the same manufacturer which resemble the false name

        Args:
            false_name: Name as it exists in the measurement source
            n: Number of proposals to return
            normalize_digits: Normalize all digits to zeros before calculating fuzzy string matching score
            normalize_extras: Remove extra details in the parentheses
            threshold: Minimum partial ratio score for a model to be proposed

        Returns:
            NameIndex with at most n proposals ordered by descending ratio, empty if the manufacturer is not known
        """
        def fuzzy(fn, a, b):
            a = a.lower()
            b = b.lower()
            if normalize_digits:
                a = re.sub(r'\d', '0', a).strip()
                b = re.sub(r'\d', '0', b).strip()
            if normalize_extras:
                a = re.sub(r'\(.+\)$', '', a).strip()
                b = re.sub(r'\(.+\)$', '', b).strip()
            return fn(a, b)

        manufacturer, manufacturer_match = self.manufacturers.find(false_name)
        if not manufacturer:
            return NameIndex([])
        false_model = re.sub(
            re.escape(manufacturer_match), '', false_name, flags=re.IGNORECASE).strip()

        # Select only the items with the same manufacturer
        models = self.name_proposals[self.name_proposals.manufacturer == manufacturer]
        model_names = models.model.tolist()

        # Calculate ratios
        partial_ratios = [fuzzy(fuzz.partial_ratio, model, false_model) for model in model_names]
        ratios = [fuzzy(fuzz.ratio, model, false_model) for model in model_names]
        models = models.assign(partial_ratio=partial_ratios)
        models = models.assign(ratio=ratios)
        models = models[models.partial_ratio >= threshold]
        # Reassign instead of sorting in place: the frame is a filtered slice
        models = models.sort_values('ratio', ascending=False)

        proposals = [
            NameItem(None, f'{manufacturer} {row["model"]}', row['form'])
            for _, row in models.iterrows()
        ]
        ni = NameIndex(items=proposals)
        ni.df = ni.df.head(n)
        return ni

    def intermediate_name(self, false_name):
        """Gets intermediate name with false name."""
        return false_name

    @staticmethod
    def download(url, true_name, output_dir, file_type=None):
        """Downloads a file from a URL

        Args:
            url: URL to download
            true_name: True name of the item to download
            output_dir: Where to write the downloaded file
            file_type: File extension. Detected automatically if None.

        Returns:
            Path to the downloaded file or None if the download failed
        """
        output_dir = os.path.abspath(output_dir)
        os.makedirs(output_dir, exist_ok=True)
        # Streamed responses keep the connection open until they are closed
        with requests.get(url, stream=True) as res:
            if res.status_code != 200:
                print(f'Failed to download "{true_name}" at "{url}"')
                return None
            if file_type is None:
                file_type = url.split('.')[-1]
            # Drop query string which may follow the extension
            file_type = file_type.split('?')[0]
            file_path = os.path.join(output_dir, f'{true_name}.{file_type}')
            with open(file_path, 'wb') as f:
                res.raw.decode_content = True
                shutil.copyfileobj(res.raw, f)
        print('Downloaded to "{}"'.format(file_path))
        return file_path

    def get_beautiful_soup(self, url):
        """Loads the URL with the driver and parses the rendered page.

        Args:
            url: URL to load

        Returns:
            BeautifulSoup of the page's outer HTML
        """
        self.driver.get(url)
        sleep(1)  # Giving some time for Selenium to render the page
        html = self.driver.find_element_by_tag_name('html').get_attribute('outerHTML')
        return BeautifulSoup(html, 'html.parser')
class Crawler(ABC):
    """Abstract base class for measurement crawlers with console prompts.

    Subclasses supply the source specific parts (reading and writing the name
    index, listing existing measurements, crawling URLs and processing one
    measurement). This class adds interactive naming prompts and download
    helpers on top of them.
    """

    def __init__(self, driver=None):
        """Initializes indexes, name proposals and crawled URLs.

        Args:
            driver: Browser driver used by ``get_beautiful_soup``
                (presumably a Selenium WebDriver - confirm against callers).
        """
        self.driver = driver
        self.name_index = self.read_name_index()
        self.name_proposals = self.get_name_proposals()
        self.existing = self.get_existing()
        self.urls = self.get_urls()
        self.manufacturers = ManufacturerIndex()

    @staticmethod
    @abstractmethod
    def read_name_index():
        """Reads name index as Index

        Returns:
            NameIndex
        """
        pass

    def get_name_proposals(self):
        """Gets name proposals for new measurements

        Returns:
            NameIndex
        """
        name_proposals = NameIndex()
        for db in ['crinacle', 'oratory1990', 'rtings', 'referenceaudioanalyzer']:
            name_index = NameIndex.read_tsv(os.path.join(DIR_PATH, db, 'name_index.tsv'))
            name_proposals.concat(name_index)
        for db in ['innerfidelity', 'headphonecom']:
            name_index = NameIndex.read_files(os.path.join(DIR_PATH, db, 'data', '**', '*.csv'))
            name_proposals.concat(name_index)
        return name_proposals

    @abstractmethod
    def write_name_index(self):
        """Writes name index to a file

        Returns:
            Index
        """
        pass

    @staticmethod
    @abstractmethod
    def get_existing():
        """Reads existing files as Index

        Returns:
            Index
        """
        pass

    @abstractmethod
    def get_urls(self):
        """Crawls measurement URLs

        Returns:
            Dict where headphone names are keys and URLs are values
        """
        pass

    @abstractmethod
    def process(self, item, url):
        """Downloads a single URL and processes it

        Args:
            item: Item
            url: URL to measurement

        Returns:
            None
        """
        pass

    @staticmethod
    def prompt_true_name(name_options):
        """Prompts true name from the user.

        Args:
            name_options: Names offered to the user, "skip" is added as the
                first option if missing. The given list is not modified.

        Returns:
            Selected option or the text the user wrote, None if skipped
        """
        # Copy so that adding "skip" doesn't leak into the caller's list
        name_options = list(name_options) if name_options is not None else []
        if 'skip' not in name_options:
            name_options.insert(0, 'skip')
        # There is always at least "skip" to select from
        print('What is it\'s true name? Select a number or write the name if none of the options.')
        print('\n'.join(f'[{i}] {o}' for i, o in enumerate(name_options)))
        while True:
            name = input('> ')
            try:
                index = int(name)
            except ValueError:
                # Not a number, the user wrote the name
                return name
            # Explicit range check, negative numbers would index from the end
            if 0 <= index < len(name_options):
                name = name_options[index]
                return None if name == 'skip' else name
            print('That didn\'t work, try again.')

    @staticmethod
    def prompt_manufacturer(name_options):
        """Prompts true manufacturer from the user.

        Args:
            name_options: Manufacturer names offered to the user, numbered from one

        Returns:
            - Selected option or the text the user wrote
            - Part of the name to replace as given by the user
        """
        name_options = name_options if name_options is not None else []
        s = 'What is it\'s true manufacturer name?'
        if name_options:
            s += ' Select a number or write the name if none of the options.'
        print(s)
        if name_options:
            print('\n'.join(f'[{i + 1}] {o}' for i, o in enumerate(name_options)))
        while True:
            name = input('> ')
            try:
                index = int(name) - 1
            except ValueError:
                # Not a number, the user wrote the name
                break
            # Explicit range check, "0" would otherwise select the last option
            if 0 <= index < len(name_options):
                name = name_options[index]
                break
            print('That didn\'t work, try again.')
        print('Which part of the name to replace')
        replace = input('> ')
        return name, replace

    @staticmethod
    def prompt_form():
        """Prompts form from the user.

        Returns:
            One of "onear", "inear" and "earbud"
        """
        options = ['onear', 'inear', 'earbud']
        print('What is it\'s type?')
        print('\n'.join(f'[{i + 1}] {o}' for i, o in enumerate(options)))
        while True:
            form = input('> ')
            try:
                index = int(form) - 1
            except ValueError:
                index = -1
            # Explicit range check, "0" would otherwise select the last option
            if 0 <= index < len(options):
                return options[index]
            print('That didn\'t work, try again.')

    def prompt(self, false_name):
        """Prompts user for true name and form based on false name.

        Args:
            false_name: Name as it exists in the measurement source

        Returns:
            NameItem or None if the user skipped the item
        """
        form = None
        if self.name_proposals is not None:
            # Name proposals initialized, add matching entries to options in prompt
            matches = []
            matches += self.name_proposals.search_by_false_name(false_name)
            matches += self.name_proposals.search_by_true_name(false_name)
            names_and_ratios = []
            for proposal, ratio in ((m[0], m[1]) for m in matches):
                if not proposal.true_name:
                    # Skip items without true name
                    continue
                # Mark exact matches in the displayed name only, the proposal
                # item itself is left untouched
                name = proposal.true_name + ' ✓' if ratio == 100 else proposal.true_name
                if name not in [x[0] for x in names_and_ratios]:
                    # New match
                    names_and_ratios.append((name, ratio, proposal.form))
                else:
                    # Existing match, update ratio
                    for i, (known_name, known_ratio, known_form) in enumerate(names_and_ratios):
                        if name == known_name and ratio > known_ratio:
                            names_and_ratios[i] = (known_name, ratio, known_form)
            best = sorted(names_and_ratios, key=lambda x: x[1], reverse=True)[:4]
            name_options = [x[0] for x in best]
            if false_name not in name_options:
                name_options.append(false_name)  # Add the false name

            # Prompt
            true_name = self.prompt_true_name(name_options)
            if true_name is None:
                return None

            # Find and replace true manufacturer name or prompt it
            if self.manufacturers.find(true_name)[0] is None:
                # Unknown manufacturer, find options with the two first words and prompt it
                manufacturer_options = []
                words = true_name.split()
                for i in range(1, min(3, len(words))):
                    candidate = ' '.join(words[:i])
                    print(candidate)
                    manufacturer_options += self.manufacturers.search(candidate)
                    if candidate not in [x[0] for x in manufacturer_options]:
                        manufacturer_options.append((candidate, 0))
                manufacturer_options = sorted(manufacturer_options, key=lambda x: x[1], reverse=True)
                manufacturer_options = [x[0] for x in manufacturer_options]
                manufacturer, replace = self.prompt_manufacturer(manufacturer_options)
                _, match = self.manufacturers.find(manufacturer)
                if match:
                    # Add as a new variant in existing manufacturer
                    for m in self.manufacturers.manufacturers:
                        if m[0] == match:
                            m.append(replace)
                else:
                    # Add new manufacturer
                    self.manufacturers.manufacturers.append([manufacturer])
                self.manufacturers.write()
            # Replace
            true_name = self.manufacturers.replace(true_name)

            # Find the answer and select form
            for name, _, f in names_and_ratios:
                if true_name == name:
                    form = f
                    break
            true_name = true_name.replace(' ✓', '')
        else:
            true_name = self.prompt_true_name([false_name])
            form = None
            if true_name is None:
                # User skipped
                return None

        if form is None:
            # Form not found in name proposals, prompt it
            form = self.prompt_form()
        return NameItem(false_name, true_name, form)

    def process_new(self, prompt=True):
        """Processes all new measurements

        Updates name index with the new entries not found in the name index
        previously.

        Args:
            prompt: Ask the user for the names of unknown items? Unknown items
                are added to the name index without true name and form if not.

        Returns:
            None
        """
        for false_name, url in self.urls.items():
            try:
                ni = self.name_index.find(false_name=false_name)
                item = ni.items[0] if ni else None
                if item and item.form == 'ignore':
                    continue

                if item and item.true_name:
                    # Name index contains the entry
                    if not self.existing.find(true_name=item.true_name):
                        # Doesn't exist already
                        self.process(item, url)
                    continue

                # Unknown item
                if prompt:
                    # Prompt true name and form
                    print(f'\n"{false_name}" is not known.')
                    item = self.prompt(false_name)
                    if item is None:
                        self.name_index.update(
                            NameItem(false_name, None, 'ignore'), false_name=false_name)
                        # Save the decision so the item is not asked again
                        self.write_name_index()
                        continue
                    self.name_index.update(item, false_name=false_name)
                    self.process(item, url)
                else:
                    print(f'"{false_name}" is not known. Add true name and form to name index and run again.')
                    self.name_index.update(
                        NameItem(false_name, None, None), false_name=false_name)
                self.write_name_index()
            except Exception:
                print(f'Processing failed for "{false_name}"')
                raise

    @staticmethod
    def download(url, true_name, output_dir, file_type=None):
        """Downloads a file from a URL

        Args:
            url: URL to download
            true_name: True name of the item to download
            output_dir: Where to write the downloaded file
            file_type: File extension. Detected automatically if None.

        Returns:
            Path to the downloaded file or None if the download failed
        """
        output_dir = os.path.abspath(output_dir)
        os.makedirs(output_dir, exist_ok=True)
        # Streamed responses keep the connection open until they are closed
        with requests.get(url, stream=True) as res:
            if res.status_code != 200:
                print(f'Failed to download "{true_name}" at "{url}"')
                return None
            if file_type is None:
                file_type = url.split('.')[-1]
            # Drop query string which may follow the extension
            file_type = file_type.split('?')[0]
            file_path = os.path.join(output_dir, '{}.{}'.format(true_name, file_type))
            with open(file_path, 'wb') as f:
                res.raw.decode_content = True
                shutil.copyfileobj(res.raw, f)
        print('Downloaded to "{}"'.format(file_path))
        return file_path

    def get_beautiful_soup(self, url):
        """Loads the URL with the driver and parses the page.

        Args:
            url: URL to load

        Returns:
            BeautifulSoup of the page's outer HTML
        """
        self.driver.get(url)
        html = self.driver.find_element_by_tag_name('html').get_attribute('outerHTML')
        return BeautifulSoup(html, 'html.parser')