def process(self):
    start_day = time.gmtime(time.time()).tm_mday
    for line in sys.stdin:
        tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
        simhash_value = Simhash(tweet_text).value
        if simhash_value in self.related_tweets_hash or tid_origin in self.pushed_tweets_ids or tid_retweet in self.pushed_tweets_ids:
            continue
        topic_id, similarity = self.classifier.classify(tweet_text)
        if topic_id == '':
            continue
        tweet_json['similarity'] = similarity
        evaluate_score = self.ranker.predict(json.dumps(tweet_json))
        total_score = similarity * evaluate_score
        if total_score < 0.15:
            continue
        is_pushed = self.pusher.push(evaluate_score, topic_id)
        if is_pushed:
            delivery_time = time.time()
            self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])
            self.pushed_tweets_ids.add(tid_retweet)
        struct_time = time.gmtime(float(timestamp[:-3]))
        utc_time = time.strftime('%Y%m%d', struct_time)
        self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tweet_text])
        self.related_tweets_hash.add(simhash_value)
        if struct_time.tm_mday != start_day:
            self.dump_result(start_day)
            start_day = struct_time.tm_mday
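# Illustrative sketch of the filtering rule above: a tweet is kept only when the
# classifier similarity times the ranker score reaches 0.15. The numbers below are
# made-up values purely to show the threshold in action.
similarity = 0.6        # e.g. returned by self.classifier.classify(tweet_text)
evaluate_score = 0.4    # e.g. returned by self.ranker.predict(...)
total_score = similarity * evaluate_score
assert total_score >= 0.15   # 0.24 -> this tweet would be considered for pushing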
def __init__(self, resume, skills_file=None):
    self.__skills_file = skills_file
    self.__matcher = Matcher(nlp.vocab)
    self.__details = {
        'name': None,
        'email': None,
        'mobile_number': None,
        'skills': None,
        'college_name': None,
        'degree': None,
        'designation': None,
        'experience': None,
        'company_names': None,
        'no_of_pages': None,
        'total_experience': None,
    }
    self.__resume = resume
    if not isinstance(self.__resume, io.BytesIO):
        ext = os.path.splitext(self.__resume)[1].split('.')[1]
    else:
        ext = self.__resume.name.split('.')[1]
    self.__text_raw = utils.extract_text(self.__resume, '.' + ext)
    self.__text = ' '.join(self.__text_raw.split())
    self.__nlp = nlp(self.__text)
    self.__custom_nlp = custom_nlp(self.__text_raw)
    self.__noun_chunks = list(self.__nlp.noun_chunks)
    self.__get_basic_details()
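# Illustrative usage sketch for the constructor above. The enclosing class name is not
# shown in this snippet, so `ResumeParser` is a hypothetical stand-in; the two input
# modes (a file path, or an io.BytesIO with a .name attribute) follow the isinstance
# branch in __init__. File names are made up.
import io

parser_from_path = ResumeParser('resumes/sample.pdf', skills_file='skills.csv')
with open('resumes/sample.pdf', 'rb') as fh:
    stream = io.BytesIO(fh.read())
    stream.name = 'sample.pdf'   # the BytesIO branch reads the extension from .name
    parser_from_stream = ResumeParser(stream)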
def process_json_tweet(t, tweets, retweets):
    u_id = t['user']['id_str']
    urls = utils.expanded_urls_from(t)
    ot = utils.get_ot_from_rt(t)
    is_reply = utils.is_reply(t)
    t_info = {
        't_id': t['id_str'],
        'u_id': u_id,
        'u_sn': t['user']['screen_name'],
        'u_dn': t['user']['name'],
        'u_desc': t['user']['description'],
        't_ts_sec': utils.extract_ts_s(t['created_at']),
        'hashtags': utils.lowered_hashtags_from(t),
        'mentioned_ids': [m['id_str'] for m in utils.mentions_from(t)],
        'urls': urls,
        'domains': [utils.extract_domain(u, lower=True) for u in urls],
        'is_rt': ot is not None,
        'retweeted_t_id': ot['id_str'] if ot else None,
        'retweeted_u_id': ot['user']['id_str'] if ot else None,
        'is_reply': is_reply,
        'replied_to_t_id': t['in_reply_to_status_id_str'] if is_reply else None,
        'replied_to_u_id': t['in_reply_to_user_id_str'] if is_reply else None,
        'text': utils.extract_text(t)
    }
    if u_id not in tweets:
        tweets[u_id] = [t_info]
    else:
        tweets[u_id].append(t_info)
    if t_info['is_rt'] and t_info['retweeted_t_id'] not in retweets:
        retweets[t_info['retweeted_t_id']] = {
            'user_id': t_info['retweeted_u_id'],
            'rt_text': t_info['text']
        }
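# Illustrative usage sketch for process_json_tweet above. The input file name and the
# newline-delimited JSON layout are assumptions; the accumulator shapes follow directly
# from the function body: tweets maps a user id to a list of t_info dicts, and retweets
# maps a retweeted tweet id to {'user_id': ..., 'rt_text': ...}.
import json

tweets, retweets = {}, {}
with open('tweets.jsonl') as fh:   # hypothetical newline-delimited JSON file
    for line in fh:
        process_json_tweet(json.loads(line), tweets, retweets)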
def get_details(self, resume_name):
    resume_path = "Resumes/" + resume_name
    ext = "pdf"
    nlp = spacy.load('en_core_web_sm')
    matcher = Matcher(nlp.vocab)
    text_raw = utils.extract_text(resume_path, '.' + ext)
    text = ' '.join(text_raw.split())
    array = text.split()
    topics = []
    field_list = [
        'OVERVIEW / CAREER OBJECTIVE / SUMMARY', 'KEY EXPERTISE / SKILLS',
        'EDUCATION', 'AWARDS AND SCHOLARSHIPS', 'INTERNSHIPS', 'PROJECTS',
        'ACHIEVEMENTS', 'SEMINARS / TRAININGS / WORKSHOPS',
        'CO-CURRICULAR ACTIVITIES', 'EXTRA CURRICULAR ACTIVITIES',
        'PERSONAL INTERESTS / HOBBIES', 'WEB LINKS', 'PERSONAL DETAILS'
    ]
    for word in field_list:
        if text.find(word) >= 0:
            topics.append(word)
    content = {}
    total_topics = len(topics)
    for i in range(total_topics - 1):
        string_to_find = topics[i] + '(.*)' + topics[i + 1]
        result = re.search(string_to_find, text)
        content[topics[i]] = result.group(1)
    temp = topics[total_topics - 1] + '(.*)'
    temp_res = re.search(temp, text)
    content[topics[total_topics - 1]] = temp_res.group(1)
    __full_text = nlp(text)
    actual_marks = "CGPA: " + '(.*)' + "/ 10.00"
    cgpa = re.search(actual_marks, content['EDUCATION'])

    # DOMAIN RANKING
    rank_text = content['KEY EXPERTISE / SKILLS'] + content['PROJECTS']
    project_text = ResumeParse().clean_project(rank_text)
    file_name = "rank/" + resume_name.split('.')[0] + ".txt"
    with open(file_name, "w+") as f:
        f.write(project_text)

    # FOR SKILLS
    skills = ResumeParse().get_skills(content['KEY EXPERTISE / SKILLS'])

    # name = utils.extract_name(__full_text, matcher)
    email = utils.extract_email(text)
    mobile = utils.extract_mobile_number(text)

    details = {}
    # details['name'] = name
    details['email'] = email
    details['mobile'] = mobile
    details['skills'] = skills
    details['cgpa'] = cgpa.group(1)
    return details, content
def test_add_sortkey(self):
    t = sortkey(self.page, self.text, 'sv')
    with open('debug.txt', 'w') as f:
        f.write(t)
    assert t == extract_text('tests/page/gå_after_sortkey_sv.txt')

    # Sortkey already present
    p = extract_text('tests/page/gå_after_sortkey_sv.txt')
    t = sortkey(self.page, p, 'sv')
    assert t == p

    # Only one section
    p = extract_text('tests/page/gå_one_section_modified.txt')
    t = sortkey(self.page, p, 'sv', section='nom')
    assert t == p

    # Only the Danish sortkey
    p = extract_text('tests/page/gå_after_sortkey_da.txt')
    t = sortkey(self.page, self.text, 'da')
    assert t == p

    # 'da' and 'no' sortkeys only
    t = self.text
    p = extract_text('tests/page/gå_after_sortkey_da_no.txt')
    for x in ('da', 'no'):
        t = sortkey(self.page, t, x)
    assert t == p

    # 'da' and 'no' sortkeys only, with languages not present on the page
    t = self.text
    p = extract_text('tests/page/gå_after_sortkey_da_no.txt')
    for x in ('da', 'no', 'de', 'fr'):
        t = sortkey(self.page, t, x)
    assert t == p

    # All sortkeys
    p = extract_text('tests/page/gå_after_all_sortkeys.txt')
    for x in self.langs:
        self.text = sortkey(self.page, self.text, x)
    assert self.text == p
def test_no_sortkey(self):
    # Wrong section
    t = sortkey(self.page, self.text, 'sv', section='dérivés')
    assert t == self.text

    # Page which doesn't need a sortkey
    p = extract_text('tests/page/manifest.txt')
    self.page.title = MagicMock(return_value='manifest')
    t = sortkey(self.page, p, 'sv')
    assert t == p
def test_extract_text(self):
    class FakeSpider(object):
        test_tag_xpath = '//p[@id="id1"]'
        test_tag_xpath_mobile = '//p[@id="mobile_id1"]'

    spider = FakeSpider()
    from utils import extract_text, extract_text_null

    body = """
    <p>p1</p>
    <p id="id1">id1</p>
    <p id="id2">id2</p>
    """
    resp = TextResponse('http://example.com', body=body)
    text = extract_text(spider, 'test_tag', resp)[0]
    self.assertEqual(text, 'id1')
    text = extract_text_null(spider, 'test_tag', resp)[0]
    self.assertEqual(text, 'id1')

    body = """
    <p>p1</p>
    <p id="id1" />
    <p id="id2">id2</p>
    """
    resp = TextResponse('http://example.com', body=body)
    text = extract_text_null(spider, 'test_tag', resp)[0]
    self.assertEqual(text, None)

    settings = get_project_settings()
    for string_mobile_list in settings.get('HTML_MOBILE_STRING'):
        mobile_body = """
        <p>p1</p>
        <p id="mobile_id1">mobile_id1</p>
        <p id="mobile_id2">mobile_id2</p>
        <p>%s</p>
        """ % (' '.join(string_mobile_list))
        resp = TextResponse('http://example.com', body=mobile_body)
        text = extract_text(spider, 'test_tag', resp)[0]
        self.assertEqual(text, 'mobile_id1')
        text = extract_text_null(spider, 'test_tag', resp)[0]
        self.assertEqual(text, 'mobile_id1')
def parse_details_page(self, response):
    page = response.url.split("/")[-1]
    filename = 'data/processed/laptoplk/%s.json' % page
    url = response.url
    title = response.selector.xpath().extract_first()
    summary = response.selector.xpath().extract_first()
    summary = utils.extract_text(summary)
    catogory = response.selector.xpath().extract_first()
    model_id = response.selector.xpath(
        '//li/b[contains(text(),"Model")]/following-sibling::strong/text()'
    ).extract_first()
    brand = model_id.split(" ")[0]
    specs = response.selector.xpath().extract_first()
    specs = utils.extract_text(specs)
    price = response.selector.xpath(
        '//b[contains(text(),"LKR")]/text()').extract_first()
    price = utils.clean_price(price)

    curr_page_data = {}
    curr_page_data["url"] = url
    curr_page_data["title"] = title
    curr_page_data["summary"] = summary
    curr_page_data["catogory"] = catogory
    curr_page_data["brand"] = brand
    curr_page_data["model_id"] = model_id
    curr_page_data["specs"] = specs
    curr_page_data["price"] = price
    curr_page_data["vendor"] = "laptop.lk"

    with open(filename, 'w') as fp:
        json.dump(curr_page_data, fp)

    # with open(filename, 'wb') as f:
    #     details = response.xpath('//div[@class="Pro"]').extract()
    #     print details
    #     f.write(str(details))
    self.log('Saved file %s' % filename)
def parse_details_page(self, response):
    url = response.url
    title = response.selector.xpath('//div[@class="product-name"]/h1/text()').extract_first()
    summary = response.selector.xpath('//div[@class="short-description"]').extract_first()
    specs = response.selector.xpath('//div[@id="product_tabs_description_tabbed_contents"]').extract_first()
    price = response.selector.xpath('//div[@class="product-shop"]//span[contains(@class,"price")]/text()').extract()[1]
    price = utils.clean_price(price)
    model_id = response.selector.xpath('/html/head/meta[@name="keywords"]/@content').extract_first()
    # if(price is None):
    #     price = response.selector.xpath('//span[@class="special-price"]/text()').extract_first()

    url_components = url.split("/")
    page = url_components[-1][:-5]
    filename = 'data/processed/metropoliton/%s.json' % page
    catogory = url_components[-2]
    brand = url_components[-1].split("-")[0]
    specs = utils.extract_text(specs)
    summary = utils.extract_text(summary)

    curr_page_data = {}
    curr_page_data["url"] = url
    curr_page_data["title"] = title
    curr_page_data["summary"] = summary
    curr_page_data["catogory"] = catogory
    curr_page_data["brand"] = brand
    curr_page_data["model_id"] = model_id
    curr_page_data["specs"] = specs
    curr_page_data["price"] = price
    curr_page_data["vendor"] = "metropoliton"

    # with open(filename, 'w') as fp:
    #     json.dump(currPageData, fp)
    yield curr_page_data
def parse_details_page(self, response):
    url = response.url
    title = response.selector.xpath('//title/text()').extract_first()
    summary = response.selector.xpath(
        '//div[@class="product-view-area"]//form/ul').extract_first()
    summary = utils.extract_text(summary)
    catogory = response.selector.xpath(
        '//div[@class="product-view-area"]//form/ul/li[contains(text(),"Categories")]/a/text()'
    ).extract_first()
    brand = response.selector.xpath(
        '//div[@class="product-view-area"]//form/ul/li[contains(text(),"Brands")]/a/text()'
    ).extract_first()
    model_id = response.selector.xpath(
        '//div[@class="product-view-area"]//form/ul/li[contains(text(),"Model")]/text()'
    ).extract_first()
    model_id = utils.extract_text(model_id)
    specs = response.selector.xpath(
        '//div[@id="description"]/div/p').extract_first()
    specs = utils.extract_text(specs)
    price = response.selector.xpath(
        '//span[@class="price"]/text()').extract_first()
    price = utils.clean_price(price)

    curr_page_data = {}
    curr_page_data["url"] = url
    curr_page_data["title"] = title
    curr_page_data["summary"] = summary
    curr_page_data["catogory"] = catogory
    curr_page_data["brand"] = brand
    curr_page_data["model_id"] = model_id
    curr_page_data["specs"] = specs
    curr_page_data["price"] = price
    curr_page_data["vendor"] = "barclays"

    yield curr_page_data
def extract():
    if request.method == 'POST':
        file = request.files['file']
        basepath = os.path.dirname(__file__)
        file_path = os.path.join(basepath, 'uploads', file.filename)
        file.save(file_path)
        text = extract_text(file_path)
        if len(text) == 0:
            flash('No text found.', 'primary')
        else:
            flash(f'The extracted text is :- {text}', 'primary')
        return redirect(url_for('extract'))
    else:
        return render_template('extract.html')
def __init__(self, resume, skills_file=None, languages_file=None, hobbies_file=None, companies_file=None):
    nlp = spacy.load('en_core_web_sm')
    self.__skills_file = skills_file
    self.__languages_file = languages_file
    self.__hobbies_file = hobbies_file
    self.__companies_file = companies_file
    self.__matcher = Matcher(nlp.vocab)
    self.__details = {
        'name': None,
        'full_name': None,
        'gender': None,
        'maritial_status': None,
        'passport_number': None,
        'date_of_birth': None,
        'email': None,
        'mobile_number': None,
        'skills': None,
        'nationality': None,
        'languages': None,
        'No. of companies': None,
        'hobbies': None,
        'education': None,
        'experience': None,
        'competencies': None,
        'measurable_results': None,
        'no_of_pages': None,
        'total_experience': None,
        'address': None,
        'state': None,
        'city': None,
        'pin': None
    }
    self.__resume = resume
    if not isinstance(self.__resume, io.BytesIO):
        ext = os.path.splitext(self.__resume)[1].split('.')[1]
    else:
        ext = self.__resume.name.split('.')[1]
    self.__text_raw = utils.extract_text(self.__resume, '.' + ext)
    self.__text = ' '.join(self.__text_raw.split())
    self.__nlp = nlp(self.__text)
    self.__noun_chunks = list(self.__nlp.noun_chunks)
    self.__get_basic_details()
def __init__(self, resume):
    nlp = spacy.load('en_core_web_sm')
    self.__matcher = Matcher(nlp.vocab)
    self.__details = {
        'name': None,
        'email': None,
        'mobile_number': None,
        'skills': None,
        'education': None,
        'experience': None,
        'competencies': None,
        'measurable_results': None
    }
    self.__resume = resume
    self.__text_raw = utils.extract_text(self.__resume, os.path.splitext(self.__resume)[1])
    self.__text = ' '.join(self.__text_raw.split())
    self.__nlp = nlp(self.__text)
    self.__noun_chunks = list(self.__nlp.noun_chunks)
    self.__get_basic_details()
def process(self):
    data_file_path = sys.argv[1]
    files = os.listdir(data_file_path)
    files.sort()
    for f in files:
        filename = os.path.join(data_file_path, f)
        logging.info(filename)
        count = 0
        for line in open(filename, 'rb'):
            start = time.clock()
            tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
            simhash_value = Simhash(tweet_text).value
            if simhash_value in self.related_tweets_hash or tid_origin in self.pushed_tweets_ids or tid_retweet in self.pushed_tweets_ids:
                continue
            topic_id, similarity = self.classifier.classify(tweet_text)
            if topic_id == '':
                continue
            count += 1
            if count % 10000 == 0:
                logging.info('%d' % count)
            tweet_json['similarity'] = similarity
            evaluate_score = self.ranker.predict(json.dumps(tweet_json))
            total_score = (evaluate_score ** 0.5) * similarity
            # if total_score < 0.15:
            #     continue
            timestruct = time.gmtime(int(timestamp[:-3]))
            is_pushed = self.pusher.push(total_score, topic_id, timestruct)
            if is_pushed:
                delivery_time = float(timestamp) / 1000.0 + (time.clock() - start)
                self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])
            utc_time = time.strftime('%Y%m%d', timestruct)
            self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tid_retweet, timestamp[:-3], tweet_text])
            self.related_tweets_hash.add(simhash_value)
            self.pushed_tweets_ids.add(tid_retweet)
        self.dump_result(f)
        self.pusher = Pusher()
    self.logger_info.info('\n=======finished!=======\n')
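# Illustrative sketch of the scoring variant above: unlike the streaming version, this
# batch run damps the ranker output with a square root before weighting it by the
# classifier similarity. The numbers are made up.
similarity = 0.6
evaluate_score = 0.4
total_score = (evaluate_score ** 0.5) * similarity   # ~0.379, vs. 0.24 for the plain product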
def tag_article(data, context):
    bucket = data["bucket"]
    name = data["name"]
    ext = os.path.splitext(name)[1] if len(os.path.splitext(name)[1]) > 1 else None
    text = None
    if ext in ['.tif', '.tiff', '.png', '.jpeg', '.jpg']:
        print("Extracting text from image file")
        text = utils.extract_text(bucket, name)
        if not text:
            print("Couldn't extract text from gs://%s/%s" % (bucket, name))
    elif ext in ['.txt']:
        print("Downloading text file from cloud")
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket)
        blob = bucket.blob(name)
        text = blob.download_as_string()
    else:
        print(f'Unsupported file type {ext}')
    if text:
        tags = get_tags(text)
        print("Found %d tags for article %s" % (len(tags), name))
        _insert_tags_bigquery(name, tags)
def __init__(self, resume):
    nlp = spacy.load('en_core_web_sm')
    self.__matcher = Matcher(nlp.vocab)
    self.__details = {
        'name': None,
        'email': None,
        'mobile_number': None,
        'skills': None,
        'education': None,
        'experience': None,
        'no_of_pages': None,
    }
    self.__resume = resume
    if not isinstance(self.__resume, io.BytesIO):
        ext = os.path.splitext(self.__resume)[1].split('.')[1]
    else:
        ext = self.__resume.name.split('.')[1]
    self.__text_raw = utils.extract_text(self.__resume, '.' + ext)
    self.__text = ' '.join(self.__text_raw.split())
    self.__nlp = nlp(self.__text)
    self.__noun_chunks = list(self.__nlp.noun_chunks)
    self.__get_basic_details()
def new_entry(page, text, lang):
    return extract_text('/src/tests/page/Paris_new_text_sv.txt')
def setUp(self):
    self.page = MagicMock()
    self.text = extract_text('/src/tests/page/Paris.txt')
def _img_payload(bucket, filename):
    print(f"Converting file gs://{bucket}/{filename} to text")
    text = utils.extract_text(bucket, filename)
    if not text:
        return None
    return {'text_snippet': {'content': text, 'mime_type': 'text/plain'}}
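# Illustrative usage sketch for _img_payload above; the bucket and object names are
# made up. When text extraction succeeds, the returned dict wraps the text as a
# plain-text snippet; otherwise it is None.
payload = _img_payload('example-bucket', 'scans/article-001.png')
if payload is not None:
    assert payload['text_snippet']['mime_type'] == 'text/plain'
    print(len(payload['text_snippet']['content']), 'characters extracted')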
def setUp(self):
    self.text = extract_text('tests/page/gå_before.txt')
    self.page = MagicMock()
    self.page.title = MagicMock(return_value='gå')
    self.langs = ('sv', 'no', 'nb', 'nn', 'da')
def test_successfully_inserting_with_sortkey(self):
    self.text += "\n{{clé de tri|abcdef}}\n"
    result = extract_text('/src/tests/page/Paris_original.txt') + "\n{{clé de tri|abcdef}}\n"
    new_text = new_entry(self.page, self.text, 'sv')
    assert new_text == result
def test_successfully_inserting_without_sortkey(self):
    new_text = new_entry(self.page, self.text, 'sv')
    assert new_text == extract_text('/src/tests/page/Paris_original.txt')
def setUp(self):
    self.text = extract_text('tests/page/manifest.txt')
    self.p = Parser(MagicMock)
def test_ending_sortkey(self):
    text = self.text + "{{clé de tri|abcdef}}\n"
    text = self.insert.category_sorting(self.page, text, self.langs)
    assert text == extract_text('/src/tests/page/Paris_sorted.txt') + "{{clé de tri|abcdef}}\n"
def build_activity_graph(tweets, t_0):
    # tweets is a tweet map { tweet_id : tweet }
    first_tweet_ts_str = utils.ts_to_str(t_0, fmt=utils.TWITTER_TS_FORMAT)
    first_tweet_ts = utils.epoch_seconds_2_ts(t_0)

    g = nx.MultiDiGraph(post_count=len(tweets))

    def add_node(g, n_id, n_type='USER', is_author=False):
        if n_id not in g:
            g.add_node(n_id, n_type=n_type, label=n_id, is_author=is_author)
        elif is_author:
            g.nodes[n_id]['is_author'] = is_author

    def node_type_for(interaction):
        if interaction == 'HASHTAG' or interaction == 'URL':
            return interaction
        else:
            return 'USER'

    def add_edge(g, from_id, to_id, tweet_id, ts_str, int_type, **kwargs):
        add_node(g, from_id, 'USER', True)
        add_node(g, to_id, n_type=node_type_for(int_type))
        t = utils.extract_ts_s(ts_str) - t_0
        attrs = {'time_t': t, 'tweet_id': tweet_id, 'interaction': int_type}
        key = '%s %s %s in %s' % (from_id, int_type, to_id, tweet_id)
        g.add_edge(from_id, to_id, key=key, **{**attrs, **kwargs})

    # Build networks
    # edge types: REPOST, MENTION, REPLY, QUOTE, URL, HASHTAG
    observed_user_ids = set()
    for tweet_id in tweets:
        tweet = tweets[tweet_id]
        hashtags = lowered_hashtags_from(tweet)
        urls = expanded_urls_from(tweet)
        mentions = mentioned_ids_from(tweet)
        tweet_text = extract_text(tweet)
        tweet_ts = tweet['created_at']
        tweet_id = tweet['id_str']
        tweeter_id = tweet['user']['id_str']
        observed_user_ids.add(tweeter_id)

        for ht in hashtags:
            add_edge(g, tweeter_id, ht, tweet_id, tweet_ts, 'HASHTAG')
        for url in urls:
            # extended tweets include a URL to their extended form
            if not embedded_extended_tweet_url(tweet_id, url):
                add_edge(g, tweeter_id, url, tweet_id, tweet_ts, 'URL')
        for mentioned_id in mentions:
            observed_user_ids.add(mentioned_id)
            add_edge(g, tweeter_id, mentioned_id, tweet_id, tweet_ts, 'MENTION')

        if 'retweeted_status' in tweet:
            retweeter = tweeter_id
            retweetee = tweet['retweeted_status']['user']['id_str']
            observed_user_ids.add(retweetee)
            add_edge(
                g, retweeter, retweetee, tweet_id, tweet_ts, 'REPOST',
                original_tweet_id=tweet['retweeted_status']['id_str'],
                original_tweet_ts=tweet['retweeted_status']['created_at'],
                posting_delay_sec=(
                    utils.extract_ts_s(tweet['retweeted_status']['created_at']) -
                    utils.extract_ts_s(tweet_ts))
            )
        elif 'quoted_status' in tweet and 'retweeted_status' not in tweet:
            quoter = tweeter_id
            quotee = tweet['quoted_status']['user']['id_str']
            observed_user_ids.add(quotee)
            add_edge(
                g, quoter, quotee, tweet_id, tweet_ts, 'QUOTE',
                original_tweet_id=tweet['quoted_status']['id_str'],
                original_tweet_ts=tweet['quoted_status']['created_at'],
                posting_delay_sec=(
                    utils.extract_ts_s(tweet['quoted_status']['created_at']) -
                    utils.extract_ts_s(tweet_ts))
            )
        elif 'in_reply_to_status_id_str' in tweet and tweet['in_reply_to_status_id_str'] in tweets:
            # only consider replies that appear in the corpus
            # basic reply info
            replier = tweeter_id
            replied_to = tweet['in_reply_to_user_id_str']
            observed_user_ids.add(replied_to)
            replied_to_status = tweets[tweet['in_reply_to_status_id_str']]
            replied_to_status_ts = replied_to_status['created_at']
            posting_delay_sec = (utils.extract_ts_s(replied_to_status_ts) -
                                 utils.extract_ts_s(tweet_ts))
            add_edge(g, replier, replied_to, tweet_id, tweet_ts, 'REPLY',
                     original_tweet_id=tweet['in_reply_to_status_id_str'],
                     original_tweet_ts=replied_to_status_ts,
                     posting_delay_sec=posting_delay_sec)

            # in conversation
            if tweet['in_reply_to_status_id_str'] in tweets:
                # follow the reply chain as far as we can
                conversation_root = root_of_conversation(tweet['in_reply_to_status_id_str'], tweets)
                # conversation_root MAY NOT be in the corpus - it's still a link though
                conv_root_ts = first_tweet_ts_str
                posting_delay_sec = (utils.ts_2_epoch_seconds(first_tweet_ts) -
                                     utils.extract_ts_s(tweet_ts))
                if conversation_root in tweets:
                    observed_user_ids.add(tweets[conversation_root]['user']['id_str'])
                    conv_root_ts = tweets[conversation_root]['created_at']
                    posting_delay_sec = (utils.extract_ts_s(conv_root_ts) -
                                         utils.extract_ts_s(tweet_ts))
                add_edge(g, replier, conversation_root, tweet_id, tweet_ts, 'IN_CONVERSATION',
                         original_tweet_id=conversation_root,
                         original_tweet_ts=conv_root_ts,
                         posting_delay_sec=posting_delay_sec)
    return g
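# Illustrative usage sketch for build_activity_graph above. `tweets` is assumed to be a
# dict of tweet_id -> tweet JSON (as the function's own comment states) and t_0 the
# epoch-seconds timestamp of the earliest tweet; the edge attributes read here are the
# ones set in add_edge.
t_0 = min(utils.extract_ts_s(t['created_at']) for t in tweets.values())
g = build_activity_graph(tweets, t_0)
print(g.graph['post_count'], g.number_of_nodes(), g.number_of_edges())
for u, v, key, attrs in g.edges(keys=True, data=True):
    if attrs['interaction'] == 'REPOST':
        print(u, '->', v, 'reposted after', attrs.get('posting_delay_sec'), 'seconds')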
def setUp(self):
    self.page = MagicMock()
    self.insert = Insert(MagicMock())
    self.text = extract_text('/src/tests/page/Paris.txt')
    self.langs = ['fr', 'nn', 'pt', 'ro', 'se', 'tl', 'tr', 'vi']
def setUp(self):
    self.text = extract_text('/src/tests/page/gå_before.txt')
def test_successfully_sorting(self):
    text_result = extract_text('/src/tests/page/Paris_sorted.txt')
    assert self.insert.category_sorting(self.page, self.text, self.langs) == text_result
def test_extract_text(self):
    bucket = "cloud-samples-data"
    filename = "vision/text/screen.jpg"
    text = utils.extract_text(bucket, filename)
    self.assertIsNotNone(text)
    self.assertIsInstance(text, str)
def extract_report(report_path):
    return utils.extract_text(report_path)