def format_as_search_query(text, broad=True):
    """Rewrite keyword matches in *text* as ``categories:<name>`` terms.

    The rules applied are those in EVENT_TYPES plus BROAD_STYLES when
    *broad* is true, or STYLES otherwise. Every match of a rule is replaced
    with `` categories:<categories_name> `` for that rule's category.

    Returns the ``text`` attribute of the StringProcessor after all
    replacements have been made.
    """
    processor = grammar_matcher.StringProcessor(text)
    rules_by_category = EVENT_TYPES.copy()
    if broad:
        rules_by_category.update(BROAD_STYLES)
    else:
        rules_by_category.update(STYLES)
    for category, rule in rules_by_category.iteritems():
        # The (replaced, count) result is unpacked but not otherwise needed
        _replaced, _count = processor.replace_with(
            rule, lambda x: ' categories:%s ' % category.categories_name)
    return processor.text
Пример #2
0
    def _parse_contents(self, response):
        # Wix pages aren't really parseable, so anytime we see them,
        # let's re-run it (depth-1) with an escaped-fragment to get the real html source
        if 'https://static.wixstatic.com/' in response.body and '_escaped_fragment_' not in response.url:
            parsed_url = urlparse(response.url)
            qs = parse_qs(parsed_url.query)
            qs['_escaped_fragment_'] = ''
            wix_scrapeable_url = urlunparse(
                (parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, urlencode(qs), parsed_url.fragment)
            )
            response.meta['depth'] -= 1
            return [scrapy.Request(wix_scrapeable_url, self.parse)]

        return
        if not hasattr(response, 'selector'):
            logging.info('Skipping unknown file from: %s', response.url)
            return
        # Get all text contents of tags (unless they are script or style tags)
        text_contents = ' '.join(response.selector.xpath('//*[not(self::script|self::style)]/text()').extract()).lower()

        processed_text = grammar_matcher.StringProcessor(text_contents, regex_keywords.WORD_BOUNDARIES)
        wrong = processed_text.get_tokens(all_styles.DANCE_WRONG_STYLE)
        good = processed_text.get_tokens(rules.STREET_STYLE)
        if (wrong or good):
            #print response.url, set(wrong), set(good)
            pass
Пример #3
0
    def has_strong_organizer(self):
        """Check the event owner's name for strong dance keywords.

        Returns False immediately when self._title_has_other() is truthy.
        Otherwise the lowercased owner name (empty string when missing) is
        matched first against GOOD_DANCE_FULL and then against
        SUPER_STRONG_KEYWORDS.

        Returns a short reason string for the first rule that matches, or
        False when neither does.
        """
        if self._title_has_other():
            return False

        org_name = self._classified_event.fb_event['info'].get(
            'owner', {}).get('name', '').lower()
        sp = grammar_matcher.StringProcessor(org_name)

        has_dance_organizer = sp.has_token(self.GOOD_DANCE_FULL)
        self._log('Searching organizer (%s) for %s, has: %s', org_name,
                  self.GOOD_DANCE_FULL.name(), has_dance_organizer)
        if has_dance_organizer:
            self._log('Has good dance in event organizer: %s' %
                      has_dance_organizer)
            return 'Has good dance in event organizer'

        has_super_strong_dance_organizer = sp.has_token(
            self.SUPER_STRONG_KEYWORDS)
        # Log the result of *this* lookup; has_dance_organizer is always
        # falsy by the time we get here, so logging it hid real matches.
        self._log('Searching organizer (%s) for %s, has: %s', org_name,
                  self.SUPER_STRONG_KEYWORDS.name(),
                  has_super_strong_dance_organizer)
        if has_super_strong_dance_organizer:
            self._log('Has super-strong dance in event organizer: %s' %
                      has_super_strong_dance_organizer)
            return 'Has super-strong dance in event organizer'
        return False
Пример #4
0
def process_doc(fb_event):
    """Build the flat feature row for a single event.

    The row holds, in order: the word count of the lowercased title, the
    word count of the lowercased description, and then for each
    (name, rule) pair in named_rules_list the rule's token count in the
    title followed by its token count in the description.

    Returns an array.array with typecode "f".
    """
    features = array.array(str("f"))
    info = fb_event['info']
    title = grammar_matcher.StringProcessor(info.get('name', '').lower())
    body = grammar_matcher.StringProcessor(
        info.get('description', '').lower())

    # Word counts: one per run of \w characters
    features.append(len(re.findall(r'\w+', title.text)))
    features.append(len(re.findall(r'\w+', body.text)))

    # TODO: Ideally we want this to be the rules_list of the GrammarFeatureVector
    for _name, rule in named_rules_list:
        title_hits = 1.0 * title.count_tokens(rule)
        body_hits = 1.0 * body.count_tokens(rule)
        features.append(title_hits)
        features.append(body_hits)
    return features
def highlight_keywords(text):
    """Return *text* as markup with ANY_GOOD matches wrapped in a span.

    The text is HTML-escaped first, then every case-insensitive match of
    rules.ANY_GOOD is wrapped in ``<span class="matched-text">``.
    """
    # NOTE(review): jinja2.Markup was removed in Jinja2 3.1 in favour of
    # markupsafe.Markup -- confirm which Jinja2 version this project pins.
    import jinja2

    def _wrap_match(match):
        # Formatting through Markup keeps the span tags marked as safe
        span_template = jinja2.Markup('<span class="matched-text">%s</span>')
        return span_template % match.group(0)

    escaped_text = jinja2.Markup.escape(text)
    processor = grammar_matcher.StringProcessor(escaped_text)
    processor.replace_with(rules.ANY_GOOD, _wrap_match, flags=re.I)
    return jinja2.Markup(processor.get_tokenized_text())
Пример #6
0
    def has_list_of_good_classes(self):
        """Look for a schedule-like group of lines dominated by good styles.

        Returns False when the event looks like a club night (ends before
        noon and lasts under 12 hours, or has more than two distinct
        CLUB_ONLY matches) or when no schedule group qualifies. Otherwise
        returns the string 'found schedule list with good styles'.
        """
        # A "list of times with dance/music things" can often be clubs as well as classes,
        # so let's try to throw out club-things first
        start_time = self._classified_event.start_time
        end_time = self._classified_event.end_time
        # Ignore club events (ends in the morning and less than 12 hours long)
        if end_time:
            ends_in_morning = end_time.time() < datetime.time(12)
            if ends_in_morning and end_time - start_time < datetime.timedelta(hours=12):
                return False

        distinct_club_keywords = set(self._get(keywords.CLUB_ONLY))
        if len(distinct_club_keywords) > 2:
            return False
        #if self._title_has_other():
        #    return False

        # if title is good strong keyword, and we have a list of classes:
        # why doesn't this get found by the is_workshop title classifier? where is our 'camp' keyword
        # https://www.dancedeets.com/events/admin_edit?event_id=317006008387038

        for schedule_lines in event_structure.get_schedule_line_groups(
                self._classified_event):
            good_lines = []
            bad_lines = []
            for line in schedule_lines:
                proc_line = grammar_matcher.StringProcessor(
                    line, self._classified_event.boundaries)
                good_matches = proc_line.get_tokens(
                    self.GOOD_OR_AMBIGUOUS_DANCE)

                # Union of the tokens found for every non-target dance rule
                bad_matches = set()
                for other_rule in self.OTHER_DANCES:
                    bad_matches.update(proc_line.get_tokens(other_rule))

                # Sometimes we have a schedule with hiphop and ballet
                # Sometimes we have a schedule with hiphop and dj and beatbox/rap (more on music side)
                # Sometimes we have a schedule with hiphop, house, and beatbox (legit, crosses boundaries)
                # TODO: Should do a better job of classifying the ambiguous music/dance types, based on the presence of non-ambiguous dance types too
                # Lines that match both kinds (or neither) are not counted at all
                if good_matches and not bad_matches:
                    self._log('Found %s in line', good_matches)
                    good_lines.append(good_matches)
                elif bad_matches and not good_matches:
                    bad_lines.append(bad_matches)

            total_lines = len(schedule_lines)
            num_dance_lines = len(good_lines) + len(bad_lines)
            self._log('Found %s of %s lines with dance styles: %s',
                      num_dance_lines, total_lines, good_lines + bad_lines)
            self._log('Found %s of %s lines with good styles: %s',
                      len(good_lines), total_lines, good_lines)
            # If more than 10% are good, then we found a good class
            if num_dance_lines >= 2 and len(good_lines) > total_lines / 10:
                return 'found schedule list with good styles'
        return False
    def classify(self):
        """Choose word-boundary handling and build the tokenized processors.

        Sets self.boundaries, self.processed_title, self.processed_text and
        self.processed_short_lines. Returns None.
        """
        #self.language not in ['ja', 'ko', 'zh-CN', 'zh-TW', 'th']:
        # Word boundaries are switched off once more than 5% of the
        # characters in the search text are CJK.
        mostly_cjk = False
        if cjk_detect.cjk_regex.search(self.search_text):
            cjk_chars = len(cjk_detect.cjk_regex.findall(self.search_text))
            mostly_cjk = 1.0 * cjk_chars / len(self.search_text) > 0.05
        if mostly_cjk:
            self.boundaries = regex_keywords.NO_WORD_BOUNDARIES
        else:
            self.boundaries = regex_keywords.WORD_BOUNDARIES

        self.processed_title = grammar_matcher.StringProcessor(
            self.title, self.boundaries)
        self.processed_text = grammar_matcher.StringProcessor(
            self.search_text, self.boundaries)

        def _tokenize_title_and_text(rule):
            self.processed_title.real_tokenize(rule)
            self.processed_text.real_tokenize(rule)

        # This must be first, to remove the fake keywords
        _tokenize_title_and_text(keywords.PREPROCESS_REMOVAL)

        # Then the style-level removals: the global rule (keyed by None)
        # followed by the rule for this event's language, when present.
        global_rule = styles.PREPROCESS_REMOVAL.get(None)
        per_language_rule = styles.PREPROCESS_REMOVAL.get(self.language)
        for optional_rule in (global_rule, per_language_rule):
            if optional_rule:
                _tokenize_title_and_text(optional_rule)

        # Or if there are bad keywords, lets see if we can find good keywords on a short line
        tokenized_lines = self.processed_text.get_tokenized_text().split('\n')
        short_text = '\n'.join(
            [line for line in tokenized_lines if len(line) < 500])
        self.processed_short_lines = grammar_matcher.StringProcessor(
            short_text, self.boundaries)
def find_rules_in_text(text, rule_dict):
    """Return the keys of *rule_dict* whose rule matches somewhere in *text*.

    The competitor list reported by event_structure (if any) is cut out of
    the text first, and only the first 400 lines of the lowercased
    remainder are searched.
    """
    # Eliminate all competitors, before trying to determine the style
    competitor_section = event_structure.find_competitor_list(text)
    if competitor_section:
        text = text.replace(competitor_section, '')

    # Only grab the first 400 lines
    leading_lines = text.lower().split('\n')[:400]
    processor = grammar_matcher.StringProcessor('\n'.join(leading_lines))
    processor.real_tokenize(keywords.PREPROCESS_REMOVAL)
    # so we can match this with vogue, but not with house
    processor.real_tokenize(keywords.HOUSE_OF)

    # Collected into a dict (rather than a list) so the returned key order
    # is exactly what dict.keys() produces.
    candidates = ((style, processor.get_tokens(rule))
                  for style, rule in rule_dict.iteritems())
    tokens_by_style = dict(
        (style, tokens) for style, tokens in candidates if tokens)
    return tokens_by_style.keys()
Пример #9
0
    def _compute_features(self, raw_documents):
        """Compute the feature matrix for *raw_documents*.

        raw_documents must be a sized iterable of (event_id, fb_event)
        pairs: it is iterated once and len() is taken of it. Each fb_event
        is passed to process_doc, using 7 parallel jobs when the
        module-level process_all flag is truthy and 1 otherwise.

        Returns a numpy array of shape
        (len(raw_documents), len(self.features)).
        """
        values = array.array(str("f"))
        # A parenthesised single-argument print prints the same text under
        # Python 2 and is also valid Python 3 syntax.
        print("Preloading regexes")
        # Run every rule once against an empty string before fanning out.
        # NOTE(review): presumably this warms a compiled-regex cache inside
        # grammar_matcher so workers don't each pay for it -- confirm.
        dummy_processor = grammar_matcher.StringProcessor('')
        for _name, rule in named_rules_list:
            dummy_processor.count_tokens(rule)

        print("Computing Features")
        result = Parallel(n_jobs=7 if process_all else 1,
                          verbose=10)(delayed(process_doc)(fb_event)
                                      for _event_id, fb_event in raw_documents)
        # Concatenate the per-document rows into one flat float buffer
        for row_values in result:
            values.extend(row_values)

        X = np.array(values)
        X.shape = (len(raw_documents), len(self.features))

        return X
Пример #10
0
    def parse_classes(self, response):
        """Yield StudioClass items for the dance-style rows of the schedule.

        Walks every ``tr`` of the ``table`` selection in *response*. Rows
        without cells, without text, containing '---', or without a time
        are skipped, as are rows whose class name has no DANCE_STYLE token.
        Each remaining row is expanded through _repeated_items_iterator.
        """
        current_date = None  # Keep track of this row-to-row
        for row in response.css('table').css('tr'):
            cells = row.css('td')
            if not cells:
                continue

            row_text = self._extract_text(row)
            if not row_text or '---' in row_text:
                continue

            # Only rows that name a day update the date; the following rows
            # reuse the most recent one.
            day_text = self._extract_text(cells[0])
            if day_text:
                current_date = dateparser.parse(day_text).date()
            times = self._extract_text(cells[1])
            classname = self._extract_text(cells[2])

            if not times:
                continue

            teacher = self._extract_text(cells[3])
            teacher_links = cells[3].xpath('.//@href').extract()

            # Use our NLP event classification keywords to figure out which BDC classes to keep
            style_processor = grammar_matcher.StringProcessor(classname)
            if not style_processor.has_token(rules.DANCE_STYLE):
                continue

            item = items.StudioClass()
            item['style'] = classname
            item['teacher'] = teacher
            if teacher_links:
                item['teacher_link'] = teacher_links[0].strip()
            # do we care?? row[4]
            start_time, end_time = parse_times(self._cleanup(times))
            item['start_time'] = datetime.datetime.combine(
                current_date, start_time)
            item['end_time'] = datetime.datetime.combine(
                current_date, end_time)
            for repeated_item in self._repeated_items_iterator(item):
                yield repeated_item
    def runTest(self):
        """Check STYLE_LOCK matching in English and in Japanese text."""
        # (input text, whether STYLE_LOCK is expected to produce tokens)
        cases = [
            (u'the blocking dance', False),
            (u'the locking dance', True),
            (u'今日はblockingです', False),
            (u'今日はlockingです', True),
            (u'今日はロックイングです', True),
            # Ideally we'd like this to return false,
            # but word segmentation is near-impossible with cjk (and japanese katakana phrases)
            (u'今日はブロックイングです', True),
        ]
        for text, expect_tokens in cases:
            string_processor = grammar_matcher.StringProcessor(text)
            tokens = string_processor.get_tokens(keywords.STYLE_LOCK)
            if expect_tokens:
                self.assertTrue(tokens)
            else:
                self.assertFalse(tokens)
def relevant_keywords(event):
    """Return the sorted, de-duplicated ANY_GOOD tokens found for *event*.

    The text searched is whatever get_relevant_text(event) returns.
    """
    processor = grammar_matcher.StringProcessor(get_relevant_text(event))
    unique_keywords = set(processor.get_tokens(rules.ANY_GOOD))
    return sorted(unique_keywords)
    def classify(self):
        """Decide whether this event is a dance event and record why.

        After running the parent class's classify(), this tokenizes a fixed
        sequence of rules on self.processed_text, collects the keyword
        matches, and walks an ordered chain of heuristics. The verdict is
        stored in self.dance_event: a short reason string when the event is
        accepted, or False when it is rejected.

        Also sets self.processed_short_lines,
        self.manual_dance_keywords_matches, self.real_dance_matches,
        self.found_dance_matches, self.found_event_matches,
        self.found_wrong_matches and self.calc_inverse_keyword_density, and
        writes the 'manual_regex', 'all_regexes' and 'all_match' timings
        into self.times. Returns None.
        """
        super(ClassifiedEvent, self).classify()

        # Running real_tokenize() on a rule replaces it with the name of the high-level rule.
        # Instead, let's grab the contents of the rule, which will assume is an Any(), and run on each of the Any()
        manual_dancer = rules.MANUAL_DANCER[grammar.STRONG]
        assert isinstance(manual_dancer, grammar.Name)
        # NOTE(review): this assertion is an exact duplicate of the line above
        assert isinstance(manual_dancer, grammar.Name)
        assert len(manual_dancer.children()) == 1
        assert isinstance(manual_dancer.children()[0], rules.Any)
        manual_dancer_children = manual_dancer.children()[0].children()
        for rule in manual_dancer_children:
            self.processed_text.real_tokenize(rule)

        self.processed_text.real_tokenize(keywords.GOOD_INSTANCE_OF_BAD_CLUB)
        #TODO(lambert): These grab things that are good, and keep them good, so they can't be stolen by other things.
        # Removing them appears to drop us from 9132 true positives down to 9108 true positives.
        # Maybe we can investigate exactly what's going on, and reduce the number of real_tokenize calls needed?
        self.processed_text.real_tokenize(keywords.DANCE)
        self.processed_text.real_tokenize(keywords.STYLE_BREAK)
        self.processed_text.real_tokenize(keywords.STYLE_ROCK)
        self.processed_text.real_tokenize(keywords.STYLE_POP)
        self.processed_text.real_tokenize(keywords.STYLE_LOCK)
        self.processed_text.real_tokenize(keywords.STYLE_WAACK)
        self.processed_text.real_tokenize(keywords.STYLE_HIPHOP)
        self.processed_text.real_tokenize(keywords.STYLE_HOUSE)
        self.processed_text.real_tokenize(keywords.STYLE_DANCEHALL)
        self.processed_text.real_tokenize(keywords.STYLE_KRUMP)
        self.processed_text.real_tokenize(keywords.STYLE_TURF)
        self.processed_text.real_tokenize(keywords.STYLE_LITEFEET)
        self.processed_text.real_tokenize(keywords.STYLE_FLEX)
        self.processed_text.real_tokenize(keywords.STYLE_BEBOP)
        self.processed_text.real_tokenize(keywords.STYLE_ALLSTYLE)

        # Snapshot of the text after all of the tokenizing above
        search_text = self.processed_text.get_tokenized_text()

        # Or if there are bad keywords, lets see if we can find good keywords on a short line
        # NOTE(review): presumably this replaces a processed_short_lines value
        # already set by the parent classify() -- confirm against the base class.
        short_lines = [
            line for line in search_text.split('\n') if len(line) < 500
        ]
        self.processed_short_lines = grammar_matcher.StringProcessor(
            '\n'.join(short_lines), self.boundaries)

        #if not self.processed_text.get_tokens(rules.ANY_GOOD):
        #    self.dance_event = False
        #    return
        # `a` is the start time for the 'all_regexes' and 'all_match' timings;
        # `b` is the start time for the 'manual_regex' timing only.
        a = time.time()
        b = time.time()
        self.manual_dance_keywords_matches = self.processed_text.get_tokens(
            rules.MANUAL_DANCE[grammar.STRONG])
        self.times['manual_regex'] = time.time() - b
        self.real_dance_matches = self.processed_text.get_tokens(
            rules.GOOD_DANCE)
        # Use the romance-aware event rule only when ROMANCE keywords are present
        if self.processed_text.get_tokens(dance_keywords.ROMANCE):
            event_matches = self.processed_text.get_tokens(
                rules.EVENT_WITH_ROMANCE_EVENT)
        else:
            event_matches = self.processed_text.get_tokens(rules.EVENT)
        club_and_event_matches = self.processed_text.get_tokens(
            dance_keywords.PRACTICE, dance_keywords.PERFORMANCE,
            dance_keywords.CONTEST)
        self.times['all_regexes'] = time.time() - a

        # Aggregate match lists: dance keywords, event keywords, and wrong-style/club keywords
        self.found_dance_matches = self.real_dance_matches + self.processed_text.get_tokens(
            dance_keywords.EASY_DANCE, keywords.AMBIGUOUS_DANCE_MUSIC,
            dance_keywords.EASY_CHOREO, keywords.HOUSE,
            keywords.TOO_EASY_VOGUE,
            keywords.EASY_VOGUE) + self.manual_dance_keywords_matches
        self.found_event_matches = event_matches + self.processed_text.get_tokens(
            keywords.EASY_EVENT, keywords.JAM) + club_and_event_matches
        self.found_wrong_matches = self.processed_text.get_tokens(
            all_styles.DANCE_WRONG_STYLE) + self.processed_text.get_tokens(
                keywords.CLUB_ONLY)

        title_wrong_style_matches = self.processed_title.get_tokens(
            all_styles.DANCE_WRONG_STYLE_TITLE)
        title_good_matches = self.processed_title.get_tokens(rules.ANY_GOOD)
        # Keyword density: words inside dance/event matches versus words in the
        # whole tokenized text (with http... runs stripped out first). Both
        # counts get +1 before dividing.
        combined_matches_string = ' '.join(self.found_dance_matches +
                                           self.found_event_matches)
        dummy, combined_matches = re.subn(r'\w+', '', combined_matches_string)
        dummy, words = re.subn(r'\w+', '',
                               re.sub(r'\bhttp.*?\s', '', search_text))
        fraction_matched = 1.0 * (combined_matches + 1) / (words + 1)
        # NOTE(review): both counts are non-negative and offset by one, so
        # fraction_matched looks to be always positive and this first branch
        # appears unreachable -- confirm.
        if not fraction_matched:
            self.calc_inverse_keyword_density = 100
        else:
            # -log2 of the fraction: lower values mean denser keyword coverage
            self.calc_inverse_keyword_density = -math.log(fraction_matched, 2)

        #print self.processed_text.count_tokens(dance_keywords.EASY_DANCE)
        #print len(club_and_event_matches)
        #print self.processed_text.count_tokens(all_styles.DANCE_WRONG_STYLE)
        #print self.processed_text.count_tokens(keywords.CLUB_ONLY)
        #strong = 0
        #for line in search_text.split('\n'):
        #   proc_line = f(line)
        #    matches = proc_line.get_tokens(rules.ANY_GOOD)
        #    good_parts = sum(len(x) for x in matches)
        #    if 1.0 * good_parts / len(line) > 0.1:
        #        # strong!
        #        strong += 1
        music_or_dance_keywords = self.processed_text.count_tokens(
            keywords.AMBIGUOUS_DANCE_MUSIC) + self.processed_text.count_tokens(
                keywords.HOUSE)
        # The first branch that applies decides the verdict, so the order of
        # the clauses below is significant.
        if len(self.manual_dance_keywords_matches) >= 1:
            self.dance_event = 'obvious dancer or dance crew or battle'
        # one critical dance keyword
        elif len(self.real_dance_matches) >= 1:
            self.dance_event = 'obvious dance style'
        # If the title has a bad-style and no good-styles, mark it bad
        elif (
                title_wrong_style_matches
                and not (self.processed_title.get_tokens(
                    keywords.AMBIGUOUS_DANCE_MUSIC)
                         or self.manual_dance_keywords_matches
                         or self.real_dance_matches)
        ):  # these two are implied by the above, but do it here just in case future clause re-ordering occurs
            self.dance_event = False

        # ambiguous music/dance keyword plus an event (or easy-choreo) match,
        # dense enough, and the title is not wrong-style-only
        elif music_or_dance_keywords >= 1 and (
                len(event_matches) +
                self.processed_text.count_tokens(dance_keywords.EASY_CHOREO)
        ) >= 1 and self.calc_inverse_keyword_density < 5 and not (
                title_wrong_style_matches and not title_good_matches):
            self.dance_event = 'hiphop/funk and good event type'
        # one critical event and a basic dance keyword and not a wrong-dance-style and not a generic-club
        elif self.processed_text.count_tokens(
                dance_keywords.EASY_DANCE) >= 1 and (
                    len(event_matches) + self.processed_text.count_tokens(
                        dance_keywords.EASY_CHOREO)
                ) >= 1 and not self.processed_text.count_tokens(
                    all_styles.DANCE_WRONG_STYLE
                ) and self.calc_inverse_keyword_density < 5:
            self.dance_event = 'dance event thats not a bad-style'
        # easy dance keyword plus any found event match, with no wrong-style
        # and no club-only keywords (no density requirement here)
        elif self.processed_text.count_tokens(
                dance_keywords.EASY_DANCE) >= 1 and len(
                    self.found_event_matches
                ) >= 1 and not self.processed_text.count_tokens(
                    all_styles.DANCE_WRONG_STYLE
                ) and self.processed_text.count_tokens(
                    keywords.CLUB_ONLY) == 0:
            self.dance_event = 'dance show thats not a club'
        # ambiguous music/dance keyword together with an easy dance keyword
        elif music_or_dance_keywords >= 1 and self.processed_text.count_tokens(
                dance_keywords.EASY_DANCE) >= 1:
            self.dance_event = 'good music and dance keyword'
        else:
            self.dance_event = False
        self.times['all_match'] = time.time() - a
Пример #14
0
 def _street_style(style):
     """Return has_token(rules.DANCE_STYLE) for the cleaned-up *style* text.

     PREPROCESS_REMOVAL phrases are tokenized away before the check.
     """
     # Use our NLP event classification keywords to figure out which BDC classes to keep
     style_processor = grammar_matcher.StringProcessor(style)
     # Get rid of "Ballet with Pop Music"
     style_processor.real_tokenize(keywords.PREPROCESS_REMOVAL)
     is_dance_style = style_processor.has_token(rules.DANCE_STYLE)
     return is_dance_style
 def notMatchRule(self, rule, s):
     """Assert that *rule* produces no tokens when run against string *s*."""
     tokens = grammar_matcher.StringProcessor(s).get_tokens(rule)
     self.assertFalse(tokens)