def parse(self, response):
    """Parse a forum page (emimino): extract each post's text, author
    name and date, normalize them and yield one item dict per post.

    Yields dicts with keys: domain, url, text, date, name.
    """
    texts = response.xpath('//div[@class="emojione-output"]').extract()
    names = response.xpath('//div[@class="user_in"]/b').extract()
    dates = response.xpath(
        '//div[@class="user_in"]//span[@class="date"]/span/text()'
    ).extract()
    # zip stops at the shortest list, so a page with mismatched counts no
    # longer raises IndexError the way blind parallel indexing did.
    for raw_text, raw_name, raw_date in zip(texts, names, dates):
        # Drop quoted replies so only the post's own words remain.
        # Raw string avoids the invalid '\/' escape of the original.
        text = re.sub(r'<blockquote>.*?</blockquote>', '', raw_text,
                      flags=re.DOTALL)
        text = functions.strip_accents(text.strip())
        text = functions.clean_text(text)
        # Strip any remaining HTML tags around the author name.
        name = re.sub(r'<[^<]+?>', '', raw_name)
        name = functions.strip_accents(name)
        date = functions.process_date_emimino(raw_date.strip().split()[0])
        # add to db
        yield {
            'domain': self.domain,
            'url': self.start_url,
            'text': text,
            'date': date,
            'name': name
        }
def parse(self, response):
    """Parse a discussion page (abc): extract each post's text, author
    name and date, normalize them and yield one item dict per post.

    Yields dicts with keys: domain, url, text, date, name.
    """
    post = '//div[@class="diskuse-prispevek"]'
    texts = response.xpath(post + '//div[@class="popis"]').extract()
    names = response.xpath(post + '//span[@class="jmeno"]').extract()
    dates = response.xpath(post + '//span[@class="datum"]').extract()
    # zip avoids IndexError when the extracts have mismatched lengths.
    for raw_text, raw_name, raw_date in zip(texts, names, dates):
        # Work on the unicode string directly: the original's
        # .encode('utf8') followed by .replace(str, str) raises TypeError
        # on Python 3, and the sibling parser already feeds unicode to
        # functions.strip_accents.
        text = raw_text.replace('<div class="popis">', '').replace(
            '</div>', '')
        text = functions.strip_accents(text)
        text = functions.clean_text(text)
        name = raw_name.replace('<span class="jmeno">', '').replace(
            '</span>', '')
        name = functions.strip_accents(name)
        date = raw_date.replace('<span class="datum">', '').replace(
            '</span>', '').strip(',').strip().split()[0]
        date = functions.process_date_abc(date)
        # add to db
        yield {
            'domain': self.domain,
            'url': self.start_url,
            'text': text,
            'date': date,
            'name': name
        }
def parse(self, response):
    """Parse a joke-listing page: extract each joke's text, normalize it
    and yield one item dict per joke.

    Yields dicts with keys: domain, url, joke.
    """
    jokes = response.xpath('//div[@class="joke-content"]').extract()
    for raw in jokes:
        # Work on the unicode string directly: the original's
        # .encode('utf8') followed by .replace(str, str) raises TypeError
        # on Python 3.
        joke = raw.replace('<div class="joke-content">', '').replace(
            '</div>', '')
        joke = functions.strip_accents(joke)
        joke = functions.clean_text(joke)
        # add to db
        yield {'domain': self.domain, 'url': self.start_url, 'joke': joke}
def parse(self, response):
    """Parse a discussion page (vitalion): extract each post's text,
    author name and date, normalize them and yield one item dict per post.

    Yields dicts with keys: domain, url, text, date, name.
    """
    texts = response.xpath('//div[@class="emojione-output"]').extract()
    names = response.xpath('//div[@class="user_in"]').extract()
    # Keep only real post dates; spans containing 'přís' are presumably
    # reply-count labels ('příspěvků') — TODO confirm against the site.
    dates = [d for d in response.xpath('//span[@class="date"]').extract()
             if 'přís' not in d]
    # Indexed access is kept deliberately so a mismatch between texts and
    # names/dates fails loudly instead of silently misaligning posts.
    for i, raw_text in enumerate(texts):
        # Work on unicode strings directly: the original's .encode('utf8')
        # followed by .replace(str, str) raises TypeError on Python 3.
        text = raw_text.replace(
            '<div class="emojione-output">\n\t\t\t\n<p>', '').replace(
            '</p>\n\n\t\t</div>', '')
        text = functions.strip_accents(text)
        text = functions.clean_text(text)
        if 'komunita' in names[i]:
            # Community posts carry extra markup: strip the wrapper, drop
            # all remaining tags, keep the first line as the name.
            name = re.sub(
                r'<[^<]+?>', '',
                names[i].replace('<div class="user_in">\n \t \n\t\t\t<b>',
                                 ''))
            name = name.split('\n')[0]
            name = functions.strip_accents(name)
        else:
            name = names[i].replace('<div class="user_in">', '').replace(
                '<b>', '').split('<a name=')[0].strip()
            name = functions.strip_accents(name)
        date = functions.process_date_vitalion(
            dates[i].replace('<span class="date"><span> ', '').replace(
                '</span></span>', ''))
        # add to db
        yield {
            'domain': self.domain,
            'url': self.start_url,
            'text': text,
            'date': date,
            'name': name
        }
def parse(self, response):
    """Parse a discussion page (doktorka): extract each post's text,
    author name and date, normalize them and yield one item dict per post.

    Posts quoting 'Isaac Asimov' are skipped (site boilerplate, presumably
    a footer quote — TODO confirm). Yields dicts with keys: domain, url,
    text, date, name.
    """
    texts = response.xpath('//div[@class="field-item even"]').extract()
    names = response.xpath('//span[@class="username"]').extract()
    dates = response.xpath('//div[@class="small"]').extract()
    # zip avoids IndexError when the extracts have mismatched lengths.
    for raw_text, raw_name, raw_date in zip(texts, names, dates):
        # Membership test and replacements operate on the unicode string
        # directly: the original's .encode('utf8') followed by
        # .replace(str, str) raises TypeError on Python 3.
        if 'Isaac Asimov' in raw_text:
            continue
        text = raw_text.replace(
            '<div class="field-item even" property="content:encoded">',
            '').replace('</div>', '')
        text = functions.strip_accents(text)
        text = functions.clean_text(text)
        name = raw_name.replace(
            '<span class="username" xml:lang="" typeof="sioc:UserAccount"'
            ' property="foaf:name" datatype="">', '').replace('</span>', '')
        name = functions.strip_accents(name)
        date = raw_date.replace('<div class="small"> ', '').replace(
            ' </div>', '')
        date = functions.process_date_doktorka(date)
        # add to db
        yield {
            'domain': self.domain,
            'url': self.start_url,
            'text': text,
            'date': date,
            'name': name
        }
def train(filename):
    """Train unigram/bigram letter and word models from a text corpus.

    If *filename* is truthy, any previously trained model JSON files are
    deleted and retrained from that corpus; otherwise the default small
    corpus is used and existing model files are reused.

    Side effects: writes/reads the four JSON files under ``training/`` and
    prints progress messages.
    """
    start_time = time.time()
    unigram_letters = 'training/unigram_letter.json'
    bigram_letters = 'training/bigram_letter.json'
    unigram_words = 'training/unigram_words.json'
    bigram_words = 'training/bigram_words.json'
    if filename:
        # Force retraining: remove stale model files, ignoring missing ones.
        for path in (unigram_letters, bigram_letters,
                     unigram_words, bigram_words):
            try:
                os.remove(path)
            except OSError:
                pass
    else:
        filename = 'corpus/corpus_sm.txt'
    # Context manager closes the corpus file (the original leaked the
    # handle returned by codecs.open).
    with codecs.open(filename, 'r', encoding='utf-8') as corpus:
        s = functions.strip_accents(corpus.read().lower())
    words = re.findall(r'\b[a-z]+\b', s)
    # Lookahead captures overlapping pairs (sliding window of width 2).
    bi_words = re.findall(r'(?=([a-z]+\s+[a-z]+))[a-z]+\s+', s)
    letters = re.findall(r'[a-z]', s)
    bi_letters = re.findall(r'(?=([a-z][a-z]))[a-z]', s)
    words_count = Counter(words)
    bi_words_count = Counter(bi_words)
    letters_count = Counter(letters)
    bi_letters_count = Counter(bi_letters)
    print("Counters took {0:.2f} seconds".format(time.time() - start_time))
    print("---------Training begins")
    if not os.path.isfile(unigram_words):
        train_unigram_words(words_count, unigram_words)
        print("")
        print("---------Unigram words trained")
    else:
        print("---------Unigram words already trained")
    if not os.path.isfile(bigram_words):
        train_bigram_words(bi_words_count, bigram_words)
        print("")
        print("---------Bigram words trained")
    else:
        print("---------Bigram words already trained")
    if not os.path.isfile(unigram_letters):
        train_unigram_letters(letters_count, unigram_letters)
        print("---------Unigram letters trained")
    else:
        print("---------Unigram letters already trained")
    if not os.path.isfile(bigram_letters):
        train_bigram_letters(bi_letters_count, bigram_letters)
        print("")
        print("---------Bigram letters trained")
    else:
        print("---------Bigram letters already trained")
    # Fixed typos in the original message ("Trainig ... it tooks").
    print("---------Training finished: it took {0:.2f} seconds".format(
        time.time() - start_time))
def train(filename):
    """Train unigram/bigram letter and word models from a text corpus.

    If *filename* is truthy, any previously trained model JSON files are
    deleted and retrained from that corpus; otherwise the default small
    corpus is used and existing model files are reused.

    Side effects: writes/reads the four JSON files under ``training/`` and
    prints progress messages.
    """
    start_time = time.time()
    unigram_letters = 'training/unigram_letter.json'
    bigram_letters = 'training/bigram_letter.json'
    unigram_words = 'training/unigram_words.json'
    bigram_words = 'training/bigram_words.json'
    if filename:
        # Force retraining: remove stale model files, ignoring missing ones.
        for path in (unigram_letters, bigram_letters,
                     unigram_words, bigram_words):
            try:
                os.remove(path)
            except OSError:
                pass
    else:
        filename = 'corpus/corpus_sm.txt'
    # Context manager closes the corpus file (the original leaked the
    # handle returned by codecs.open).
    with codecs.open(filename, 'r', encoding='utf-8') as corpus:
        s = functions.strip_accents(corpus.read().lower())
    words = re.findall(r'\b[a-z]+\b', s)
    # Lookahead captures overlapping pairs (sliding window of width 2).
    bi_words = re.findall(r'(?=([a-z]+\s+[a-z]+))[a-z]+\s+', s)
    letters = re.findall(r'[a-z]', s)
    bi_letters = re.findall(r'(?=([a-z][a-z]))[a-z]', s)
    words_count = Counter(words)
    bi_words_count = Counter(bi_words)
    letters_count = Counter(letters)
    bi_letters_count = Counter(bi_letters)
    print("Counters took {0:.2f} seconds".format(time.time() - start_time))
    print("---------Training begins")
    if not os.path.isfile(unigram_words):
        train_unigram_words(words_count, unigram_words)
        print("")
        print("---------Unigram words trained")
    else:
        print("---------Unigram words already trained")
    if not os.path.isfile(bigram_words):
        train_bigram_words(bi_words_count, bigram_words)
        print("")
        print("---------Bigram words trained")
    else:
        print("---------Bigram words already trained")
    if not os.path.isfile(unigram_letters):
        train_unigram_letters(letters_count, unigram_letters)
        print("---------Unigram letters trained")
    else:
        print("---------Unigram letters already trained")
    if not os.path.isfile(bigram_letters):
        train_bigram_letters(bi_letters_count, bigram_letters)
        print("")
        print("---------Bigram letters trained")
    else:
        print("---------Bigram letters already trained")
    # Fixed typos in the original message ("Trainig ... it tooks").
    print("---------Training finished: it took {0:.2f} seconds".format(
        time.time() - start_time))