Example #1
    def process(self):
        start_day = time.gmtime(time.time()).tm_mday
        for line in sys.stdin:
            tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
            simhash_value = Simhash(tweet_text).value
            if simhash_value in self.related_tweets_hash or tid_origin in self.pushed_tweets_ids or tid_retweet in self.pushed_tweets_ids:
                continue

            topic_id, similarity = self.classifier.classify(tweet_text)
            if topic_id == '':
                continue

            tweet_json['similarity'] = similarity
            evaluate_score = self.ranker.predict(json.dumps(tweet_json))
            total_score = similarity * evaluate_score
            if total_score < 0.15:
                continue

            is_pushed = self.pusher.push(evaluate_score, topic_id)
            if is_pushed:
                delivery_time = time.time()
                self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])
                self.pushed_tweets_ids.add(tid_retweet)

            struct_time = time.gmtime(float(timestamp[:-3]))
            utc_time = time.strftime('%Y%m%d', struct_time)
            self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tweet_text])
            self.related_tweets_hash.add(simhash_value)

            if struct_time.tm_mday != start_day:
                self.dump_result(start_day)
                start_day = struct_time.tm_mday
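
The utils.extract_text helper used in Examples #1, #14 and #17 is not shown in this listing. A minimal sketch of what it could look like, assuming each input line is a JSON-encoded tweet in the standard Twitter streaming format (the field names below are assumptions, not the project's actual schema):

import json

def extract_text(line):
    # Hypothetical helper: parse one JSON tweet line into the tuple used above.
    # Returns (tweet_text, tid_origin, tid_retweet, timestamp, tweet_json).
    tweet_json = json.loads(line)
    timestamp = tweet_json.get('timestamp_ms', '0')   # assumed field name
    tid_retweet = tweet_json['id_str']
    if 'retweeted_status' in tweet_json:              # assumed payload structure
        origin = tweet_json['retweeted_status']
        tid_origin = origin['id_str']
        tweet_text = origin.get('text', '')
    else:
        tid_origin = tweet_json['id_str']
        tweet_text = tweet_json.get('text', '')
    return tweet_text, tid_origin, tid_retweet, timestamp, tweet_json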
Example #2
 def __init__(self, resume, skills_file=None):
     self.__skills_file = skills_file
     self.__matcher = Matcher(nlp.vocab)
     self.__details = {
         'name': None,
         'email': None,
         'mobile_number': None,
         'skills': None,
         'college_name': None,
         'degree': None,
         'designation': None,
         'experience': None,
         'company_names': None,
         'no_of_pages': None,
         'total_experience': None,
     }
     self.__resume = resume
     if not isinstance(self.__resume, io.BytesIO):
         ext = os.path.splitext(self.__resume)[1].split('.')[1]
     else:
         ext = self.__resume.name.split('.')[1]
     self.__text_raw = utils.extract_text(self.__resume, '.' + ext)
     self.__text = ' '.join(self.__text_raw.split())
     self.__nlp = nlp(self.__text)
     self.__custom_nlp = custom_nlp(self.__text_raw)
     self.__noun_chunks = list(self.__nlp.noun_chunks)
     self.__get_basic_details()
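
Examples #2, #12, #13 and #16 pass the resume path (or a BytesIO stream) together with its extension to utils.extract_text. The helper itself is not part of this listing; a rough sketch, assuming PDFs are handled with pdfminer.six and .docx files with docx2txt (both library choices are assumptions), might look like:

import docx2txt
from pdfminer.high_level import extract_text as pdf_extract_text

def extract_text(resume, extension):
    # Hypothetical dispatch on file extension; not the project's actual helper.
    if extension == '.pdf':
        # pdfminer accepts both file paths and binary file-like objects
        return pdf_extract_text(resume)
    if extension == '.docx':
        return docx2txt.process(resume)
    raise ValueError('Unsupported resume format: %s' % extension)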
Example #3
def process_json_tweet(t, tweets, retweets):
    u_id = t['user']['id_str']
    urls = utils.expanded_urls_from(t)
    ot = utils.get_ot_from_rt(t)
    is_reply = utils.is_reply(t)
    t_info = {
        't_id': t['id_str'],
        'u_id': u_id,  #t['user']['id_str'],
        'u_sn': t['user']['screen_name'],
        'u_dn': t['user']['name'],
        'u_desc': t['user']['description'],
        't_ts_sec': utils.extract_ts_s(t['created_at']),
        'hashtags': utils.lowered_hashtags_from(t),
        'mentioned_ids': [m['id_str'] for m in utils.mentions_from(t)],
        'urls': urls,
        'domains': [utils.extract_domain(u, lower=True) for u in urls],
        'is_rt': ot is not None,
        'retweeted_t_id': ot['id_str'] if ot else None,
        'retweeted_u_id': ot['user']['id_str'] if ot else None,
        'is_reply': is_reply,
        'replied_to_t_id': t['in_reply_to_status_id_str'] if is_reply else None,
        'replied_to_u_id': t['in_reply_to_user_id_str'] if is_reply else None,
        'text': utils.extract_text(t)
    }
    if u_id not in tweets:
        tweets[u_id] = [t_info]
    else:
        tweets[u_id].append(t_info)
    if t_info['is_rt'] and t_info['retweeted_t_id'] not in retweets:
        retweets[t_info['retweeted_t_id']] = {
            'user_id': t_info['retweeted_u_id'],
            'rt_text': t_info['text']
        }
Example #4
    def get_details(self, resume_name):
        resume_path = "Resumes/" + resume_name
        ext = "pdf"
        nlp = spacy.load('en_core_web_sm')
        matcher = Matcher(nlp.vocab)
        text_raw = utils.extract_text(resume_path, '.' + ext)
        text = ' '.join(text_raw.split())
        array = text.split()

        topics = []
        field_list = [
            'OVERVIEW / CAREER OBJECTIVE / SUMMARY', 'KEY EXPERTISE / SKILLS',
            'EDUCATION', 'AWARDS AND SCHOLARSHIPS', 'INTERNSHIPS', 'PROJECTS',
            'ACHIEVEMENTS', 'SEMINARS / TRAININGS / WORKSHOPS',
            'CO-CURRICULAR ACTIVITIES', 'EXTRA CURRICULAR ACTIVITIES',
            'PERSONAL INTERESTS / HOBBIES', 'WEB LINKS', 'PERSONAL DETAILS'
        ]
        for word in field_list:
            if word in text:
                topics.append(word)

        content = {}
        total_topics = len(topics)
        for i in range(total_topics - 1):
            string_to_find = topics[i] + '(.*)' + topics[i + 1]
            result = re.search(string_to_find, text)
            content[topics[i]] = result.group(1)
        temp = topics[total_topics - 1] + '(.*)'
        temp_res = re.search(temp, text)
        content[topics[total_topics - 1]] = temp_res.group(1)

        __full_text = nlp(text)

        actual_marks = "CGPA: " + '(.*)' + "/ 10.00"
        cgpa = re.search(actual_marks, content['EDUCATION'])

        # DOMAIN RANKING
        rank_text = content['KEY EXPERTISE / SKILLS'] + content['PROJECTS']
        project_text = ResumeParse().clean_project(rank_text)

        file_name = "rank/" + resume_name.split('.')[0] + ".txt"
        f = open(file_name, "w+")
        f.write(project_text)
        f.close()

        #FOR SKILLS
        skills = ResumeParse().get_skills(content['KEY EXPERTISE / SKILLS'])

        # name=utils.extract_name(__full_text,matcher)
        email = utils.extract_email(text)
        mobile = utils.extract_mobile_number(text)
        details = {}
        # details['name']=name
        details['email'] = email
        details['mobile'] = mobile
        details['skills'] = skills
        details['cgpa'] = cgpa.group(1)
        return details, content
Example #5
 def test_add_sortkey(self):
     t = sortkey(self.page, self.text, 'sv')
     with open('debug.txt', 'w') as f:
         f.write(t)
     assert t == extract_text('tests/page/gå_after_sortkey_sv.txt')
     # Sortkey already present
     p = extract_text('tests/page/gå_after_sortkey_sv.txt')
     t = sortkey(self.page, p, 'sv')
     assert t == p
     # Only one section
     p = extract_text('tests/page/gå_one_section_modified.txt')
     t = sortkey(self.page, p, 'sv', section='nom')
     assert t == p
     # Only danish sortkey
     p = extract_text('tests/page/gå_after_sortkey_da.txt')
     t = sortkey(self.page, self.text, 'da')
     assert t == p
     # 'da' and 'no' sortkeys only
     t = self.text
     p = extract_text('tests/page/gå_after_sortkey_da_no.txt')
     for x in ('da', 'no'):
         t = sortkey(self.page, t, x)
     assert t == p
     # 'da' and 'no' sortkeys only with non present languages
     t = self.text
     p = extract_text('tests/page/gå_after_sortkey_da_no.txt')
     for x in ('da', 'no', 'de', 'fr'):
         t = sortkey(self.page, t, x)
     assert t == p
     # All sortkeys
     p = extract_text('tests/page/gå_after_all_sortkeys.txt')
     for x in self.langs:
         self.text = sortkey(self.page, self.text, x)
     assert self.text == p
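
In the wiki-editing tests (Examples #5, #6, #18 through #25 and #27 through #29), extract_text simply loads an expected page body from a fixture file. A plausible implementation, assuming UTF-8 fixtures:

def extract_text(path):
    # Hypothetical test helper: read a wikitext fixture as a single string.
    with open(path, encoding='utf-8') as f:
        return f.read()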
Example #6
 def test_no_sortkey(self):
     # Wrong section
     t = sortkey(self.page, self.text, 'sv', section='dérivés')
     assert t == self.text
     # Page which doesn't need a sortkey
     p = extract_text('tests/page/manifest.txt')
     self.page.title = MagicMock(return_value='manifest')
     t = sortkey(self.page, p, 'sv')
     assert t == p
Example #7
  def test_extract_text(self):

    class FakeSpider(object):
      test_tag_xpath = '//p[@id="id1"]'
      test_tag_xpath_mobile = '//p[@id="mobile_id1"]'

    spider = FakeSpider()
    from utils import extract_text, extract_text_null
    body = """
    <p>p1</p>
    <p id="id1">id1</p>
    <p id="id2">id2</p>
    """
    resp = TextResponse('http://example.com', body=body)
    text = extract_text(spider, 'test_tag', resp)[0]
    self.assertEqual(text, 'id1')

    text = extract_text_null(spider, 'test_tag', resp)[0]
    self.assertEqual(text, 'id1')

    body = """
    <p>p1</p>
    <p id="id1" />
    <p id="id2">id2</p>
    """
    resp = TextResponse('http://example.com', body=body)
    text = extract_text_null(spider, 'test_tag', resp)[0]
    self.assertEqual(text, None)

    settings = get_project_settings()
    for string_mobile_list in settings.get('HTML_MOBILE_STRING'):
      mobile_body = """
      <p>p1</p>
      <p id="mobile_id1">mobile_id1</p>
      <p id="mobile_id2">mobile_id2</p>
      <p>%s</p>
      """ % (' '.join(string_mobile_list))
      resp = TextResponse('http://example.com', body=mobile_body)
      text = extract_text(spider, 'test_tag', resp)[0]
      self.assertEqual(text, 'mobile_id1')

      text = extract_text_null(spider, 'test_tag', resp)[0]
      self.assertEqual(text, 'mobile_id1')
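
Example #7 exercises an extract_text(spider, key, response) variant that looks up an XPath attribute on the spider and switches to the corresponding *_xpath_mobile attribute when the response looks like a mobile page. The helper is not reproduced in this listing; a simplified sketch of that behaviour (the HTML_MOBILE_STRING setting and the attribute naming come from the test, everything else is an assumption):

from scrapy.utils.project import get_project_settings

def extract_text(spider, key, response):
    # Hypothetical re-implementation of the helper exercised in Example #7.
    settings = get_project_settings()
    is_mobile = any(' '.join(words) in response.text
                    for words in settings.get('HTML_MOBILE_STRING', []))
    suffix = '_xpath_mobile' if is_mobile else '_xpath'
    xpath = getattr(spider, key + suffix)
    return response.xpath(xpath + '/text()').extract()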
Example #8
    def parse_details_page(self, response):

        page = response.url.split("/")[-1]
        filename = 'data/processed/laptoplk/%s.json' % page

        url = response.url
        title = response.selector.xpath().extract_first()
        summary = response.selector.xpath().extract_first()
        summary = utils.extract_text(summary)

        catogory = response.selector.xpath().extract_first()
        model_id = response.selector.xpath(
            '//li/b[contains(text(),"Model")]/following-sibling::strong/text()'
        ).extract_first()
        brand = model_id.split(" ")[0]
        specs = response.selector.xpath().extract_first()
        specs = utils.extract_text(specs)
        price = response.selector.xpath(
            '//b[contains(text(),"LKR")]/text()').extract_first()
        price = utils.clean_price(price)

        curr_page_data = {}
        curr_page_data["url"] = url
        curr_page_data["title"] = title
        curr_page_data["summary"] = summary

        curr_page_data["catogory"] = catogory
        curr_page_data["brand"] = brand
        curr_page_data["model_id"] = model_id
        curr_page_data["specs"] = specs
        curr_page_data["price"] = price
        curr_page_data["vendor"] = "laptop.lk"

        with open(filename, 'w') as fp:
            json.dump(curr_page_data, fp)
        # with open(filename, 'wb') as f:
        #     details = response.xpath('//div[@class="Pro"]').extract()
        #     print details
        #     f.write(str(details))
        self.log('Saved file %s' % filename)
Example #9
    def parse_details_page(self, response):

        url = response.url
        title = response.selector.xpath('//div[@class="product-name"]/h1/text()').extract_first()
        summary = response.selector.xpath('//div[@class="short-description"]').extract_first()
        specs = response.selector.xpath('//div[@id="product_tabs_description_tabbed_contents"]').extract_first()
        price = response.selector.xpath('//div[@class="product-shop"]//span[contains(@class,"price")]/text()').extract()[1]
        price = utils.clean_price(price)
        model_id = response.selector.xpath('/html/head/meta[@name="keywords"]/@content').extract_first()
        
        # if(price is None):
        #     price = response.selector.xpath('//span[@class="special-price"]/text()').extract_first()

        url_components = url.split("/")
        page = url_components[-1][:-5]
        filename = 'data/processed/metropoliton/%s.json' % page
        catogory = url_components[-2]
        brand = url_components[-1].split("-")[0]

        specs = utils.extract_text(specs)
        summary = utils.extract_text(summary)

        curr_page_data = {}        
        curr_page_data["url"] = url
        curr_page_data["title"] = title
        curr_page_data["summary"] = summary

        curr_page_data["catogory"] = catogory
        curr_page_data["brand"] = brand
        curr_page_data["model_id"] = model_id
        curr_page_data["specs"] = specs
        curr_page_data["price"] = price
        curr_page_data["vendor"] = "metropoliton"
        
        # with open(filename, 'w') as fp:
        #     json.dump(currPageData, fp)

        yield curr_page_data
Example #10
    def parse_details_page(self, response):

        url = response.url
        title = response.selector.xpath('//title/text()').extract_first()
        summary = response.selector.xpath(
            '//div[@class="product-view-area"]//form/ul').extract_first()
        summary = utils.extract_text(summary)

        catogory = response.selector.xpath(
            '//div[@class="product-view-area"]//form/ul/li[contains(text(),"Categories")]/a/text()'
        ).extract_first()
        brand = response.selector.xpath(
            '//div[@class="product-view-area"]//form/ul/li[contains(text(),"Brands")]/a/text()'
        ).extract_first()
        model_id = response.selector.xpath(
            '//div[@class="product-view-area"]//form/ul/li[contains(text(),"Model")]/text()'
        ).extract_first()
        model_id = utils.extract_text(model_id)
        specs = response.selector.xpath(
            '//div[@id="description"]/div/p').extract_first()
        specs = utils.extract_text(specs)
        price = response.selector.xpath(
            '//span[@class="price"]/text()').extract_first()
        price = utils.clean_price(price)

        curr_page_data = {}
        curr_page_data["url"] = url
        curr_page_data["title"] = title
        curr_page_data["summary"] = summary

        curr_page_data["catogory"] = catogory
        curr_page_data["brand"] = brand
        curr_page_data["model_id"] = model_id
        curr_page_data["specs"] = specs
        curr_page_data["price"] = price
        curr_page_data["vendor"] = "barclays"

        yield curr_page_data
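
In the price-scraping spiders (Examples #8 through #10), utils.extract_text flattens an extracted HTML fragment (summary, specs, model cell) into plain text. A small sketch of such a helper, assuming the w3lib package that ships with Scrapy is acceptable (an assumption, not the project's actual implementation):

from w3lib.html import remove_tags, replace_entities

def extract_text(html_fragment):
    # Hypothetical: strip markup and collapse whitespace from a selector result.
    if html_fragment is None:
        return None
    return ' '.join(replace_entities(remove_tags(html_fragment)).split())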
Example #11
def extract():
    if request.method == 'POST':
        file = request.files['file']
        basepath = os.path.dirname(__file__)
        file_path = os.path.join(basepath, 'uploads', file.filename)
        file.save(file_path)
        text = extract_text(file_path)
        if len(text) == 0:
            flash('No text found.','primary')
        else:
            flash(f'The extracted text is :- {text}', 'primary')
        return redirect(url_for('extract'))
    else:
        return render_template('extract.html')
Example #12
 def __init__(self,
              resume,
              skills_file=None,
              languages_file=None,
              hobbies_file=None,
              companies_file=None):
     nlp = spacy.load('en_core_web_sm')
     self.__skills_file = skills_file
     self.__languages_file = languages_file
     self.__hobbies_file = hobbies_file
     self.__companies_file = companies_file
     self.__matcher = Matcher(nlp.vocab)
     self.__details = {
         'name': None,
         'full_name': None,
         'gender': None,
         'maritial_status': None,
         'passport_number': None,
         'date_of_birth': None,
         'email': None,
         'mobile_number': None,
         'skills': None,
         'nationality': None,
         'languages': None,
         'No. of companies': None,
         'hobbies': None,
         'education': None,
         'experience': None,
         'competencies': None,
         'measurable_results': None,
         'no_of_pages': None,
         'total_experience': None,
         'address': None,
         'state': None,
         'city': None,
         'pin': None
     }
     self.__resume = resume
     if not isinstance(self.__resume, io.BytesIO):
         ext = os.path.splitext(self.__resume)[1].split('.')[1]
     else:
         ext = self.__resume.name.split('.')[1]
     self.__text_raw = utils.extract_text(self.__resume, '.' + ext)
     self.__text = ' '.join(self.__text_raw.split())
     self.__nlp = nlp(self.__text)
     self.__noun_chunks = list(self.__nlp.noun_chunks)
     self.__get_basic_details()
Example #13
 def __init__(self, resume):
     nlp = spacy.load('en_core_web_sm')
     self.__matcher = Matcher(nlp.vocab)
     self.__details = {
         'name'              : None,
         'email'             : None,
         'mobile_number'     : None,
         'skills'            : None,
         'education'         : None,
         'experience'        : None,
         'competencies'      : None,
         'measurable_results': None
     }
     self.__resume      = resume
     self.__text_raw    = utils.extract_text(self.__resume, os.path.splitext(self.__resume)[1])
     self.__text        = ' '.join(self.__text_raw.split())
     self.__nlp         = nlp(self.__text)
     self.__noun_chunks = list(self.__nlp.noun_chunks)
     self.__get_basic_details()
Example #14
    def process(self):
        data_file_path = sys.argv[1]
        files = os.listdir(data_file_path)
        files.sort()
        for f in files:
            filename = os.path.join(data_file_path, f)
            logging.info(filename)
            count = 0
            for line in open(filename, 'rb'):
                start = time.perf_counter()  # time.clock() was removed in Python 3.8
                tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
                simhash_value = Simhash(tweet_text).value
                if simhash_value in self.related_tweets_hash or tid_origin in self.pushed_tweets_ids or tid_retweet in self.pushed_tweets_ids:
                    continue

                topic_id, similarity = self.classifier.classify(tweet_text)
                if topic_id == '':
                    continue

                count += 1
                if count % 10000 == 0:  logging.info('%d' % count)

                tweet_json['similarity'] = similarity
                evaluate_score = self.ranker.predict(json.dumps(tweet_json))
                total_score = (evaluate_score ** 0.5) * similarity
                # if total_score < 0.15:
                #     continue

                timestruct = time.gmtime(int(timestamp[:-3]))
                is_pushed = self.pusher.push(total_score, topic_id, timestruct)
                if is_pushed:
                    delivery_time = float(timestamp) / 1000.0 + (time.perf_counter() - start)
                    self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])

                utc_time = time.strftime('%Y%m%d', timestruct)
                self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tid_retweet, timestamp[:-3], tweet_text])

                self.related_tweets_hash.add(simhash_value)
                self.pushed_tweets_ids.add(tid_retweet)
            self.dump_result(f)
            self.pusher = Pusher()
        self.logger_info.info('\n=======finished!=======\n')
Example #15
def tag_article(data, context):
    bucket = data["bucket"]
    name = data["name"]
    ext = os.path.splitext(name)[1] if len(os.path.splitext(name)[1]) > 1 else None
    text = None
    if ext in ['.tif', '.tiff', '.png', '.jpeg', '.jpg']:
        print("Extracting text from image file")
        text = utils.extract_text(bucket, name)
        if not text:
            print("Couldn't extract text from gs://%s/%s" % (bucket, name))
    elif ext in ['.txt']:
        print("Downloading text file from cloud")
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket)
        blob = bucket.blob(name)
        text = blob.download_as_string()
    else:
        print(f'Unsupported file type {ext}')
    if text:
        tags = get_tags(text)
        print("Found %d tags for article %s" % (len(tags), name))
        _insert_tags_bigquery(name, tags)
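
Examples #15, #20 and #30 call utils.extract_text(bucket, filename) to OCR an image stored in Cloud Storage. One hedged way such a helper could be written with the Cloud Vision client (the library choice is an assumption about the project):

from google.cloud import vision

def extract_text(bucket, filename):
    # Hypothetical OCR helper: run document text detection on a GCS object.
    client = vision.ImageAnnotatorClient()
    source = vision.ImageSource(image_uri='gs://%s/%s' % (bucket, filename))
    response = client.document_text_detection(image=vision.Image(source=source))
    if response.error.message:
        return None
    return response.full_text_annotation.text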
Example #16
 def __init__(self, resume):
     nlp = spacy.load('en_core_web_sm')
     self.__matcher = Matcher(nlp.vocab)
     self.__details = {
         'name'              : None,
         'email'             : None,
         'mobile_number'     : None,
         'skills'            : None,
         'education'         : None,
         'experience'        : None,
         'no_of_pages'       : None,
     }
     self.__resume      = resume
     if not isinstance(self.__resume, io.BytesIO):
         ext = os.path.splitext(self.__resume)[1].split('.')[1]
     else:
         ext = self.__resume.name.split('.')[1]
     self.__text_raw    = utils.extract_text(self.__resume, '.' + ext)
     self.__text        = ' '.join(self.__text_raw.split())
     self.__nlp         = nlp(self.__text)
     self.__noun_chunks = list(self.__nlp.noun_chunks)
     self.__get_basic_details()
Example #17
    def process(self):
        start_day = time.gmtime(time.time()).tm_mday
        for line in sys.stdin:
            tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(
                line)
            simhash_value = Simhash(tweet_text).value
            if simhash_value in self.related_tweets_hash or tid_origin in self.pushed_tweets_ids or tid_retweet in self.pushed_tweets_ids:
                continue

            topic_id, similarity = self.classifier.classify(tweet_text)
            if topic_id == '':
                continue

            tweet_json['similarity'] = similarity
            evaluate_score = self.ranker.predict(json.dumps(tweet_json))
            total_score = similarity * evaluate_score
            if total_score < 0.15:
                continue

            is_pushed = self.pusher.push(evaluate_score, topic_id)
            if is_pushed:
                delivery_time = time.time()
                self.pushed_tweets[topic_id].append([
                    tid_origin,
                    str(delivery_time)[:10], similarity, total_score,
                    tweet_text
                ])
                self.pushed_tweets_ids.add(tid_retweet)

            struct_time = time.gmtime(float(timestamp[:-3]))
            utc_time = time.strftime('%Y%m%d', struct_time)
            self.related_tweets[topic_id].append(
                [utc_time, tid_origin, total_score, tweet_text])
            self.related_tweets_hash.add(simhash_value)

            if struct_time.tm_mday != start_day:
                self.dump_result(start_day)
                start_day = struct_time.tm_mday
Example #18
def new_entry(page, text, lang):
    return extract_text('/src/tests/page/Paris_new_text_sv.txt')
Example #19
 def setUp(self):
     self.page = MagicMock()
     self.text = extract_text('/src/tests/page/Paris.txt')
Example #20
def _img_payload(bucket, filename):
    print(f"Converting file gs://{bucket}/{filename} to text")
    text = utils.extract_text(bucket, filename)
    if not text:
        return None
    return {'text_snippet': {'content': text, 'mime_type': 'text/plain'}}
Example #21
 def setUp(self):
     self.text = extract_text('tests/page/gå_before.txt')
     self.page = MagicMock()
     self.page.title = MagicMock(return_value='gå')
     self.langs = ('sv', 'no', 'nb', 'nn', 'da')
Example #22
 def test_successfully_inserting_with_sortkey(self):
     self.text += "\n{{clé de tri|abcdef}}\n"
     result = extract_text(
         '/src/tests/page/Paris_original.txt') + "\n{{clé de tri|abcdef}}\n"
     new_text = new_entry(self.page, self.text, 'sv')
     assert new_text == result
Example #23
 def test_successfully_inserting_without_sortkey(self):
     new_text = new_entry(self.page, self.text, 'sv')
     assert new_text == extract_text('/src/tests/page/Paris_original.txt')
Example #24
 def setUp(self):
     self.text = extract_text('tests/page/manifest.txt')
     self.p = Parser(MagicMock)
Example #25
 def test_ending_sortkey(self):
     text = self.text + "{{clé de tri|abcdef}}\n"
     text = self.insert.category_sorting(self.page, text, self.langs)
     assert text == extract_text(
         '/src/tests/page/Paris_sorted.txt') + "{{clé de tri|abcdef}}\n"
Example #26
def build_activity_graph(tweets, t_0):  # tweets is a tweet map { tweet_id: tweet }
    first_tweet_ts_str = utils.ts_to_str(t_0, fmt=utils.TWITTER_TS_FORMAT)
    first_tweet_ts = utils.epoch_seconds_2_ts(t_0)
    g = nx.MultiDiGraph(post_count=len(tweets))

    def add_node(g, n_id, n_type='USER', is_author=False):
        if n_id not in g:
            g.add_node(n_id, n_type=n_type, label=n_id, is_author=is_author)
        elif is_author:
            # g.nodes[n_id]['n_type'] = n_type
            g.nodes[n_id]['is_author'] = is_author

    def node_type_for(interaction):
        if interaction == 'HASHTAG' or interaction == 'URL':
            return interaction
        else:
            return 'USER'

    def add_edge(g, from_id, to_id, tweet_id, ts_str, int_type, **kwargs):
        add_node(g, from_id, 'USER', True)
        # g.nodes[from_id]['is_author'] = True
        add_node(g, to_id, n_type=node_type_for(int_type))
        t = utils.extract_ts_s(ts_str) - t_0
        attrs = {'time_t': t, 'tweet_id': tweet_id, 'interaction': int_type}
        key = '%s %s %s in %s' % (from_id, int_type, to_id, tweet_id)
        g.add_edge(from_id, to_id, key=key, **{**attrs, **kwargs})

    # Build networks
    # edge types: REPOST, MENTION, REPLY, QUOTE, URL, HASHTAG
    observed_user_ids = set()
    for tweet_id in tweets:
        tweet = tweets[tweet_id]
        hashtags = lowered_hashtags_from(tweet)
        urls = expanded_urls_from(tweet)
        mentions = mentioned_ids_from(tweet)
        tweet_text = extract_text(tweet)
        tweet_ts = tweet['created_at']
        tweet_id = tweet['id_str']
        tweeter_id = tweet['user']['id_str']
        observed_user_ids.add(tweeter_id)

        for ht in hashtags:
            add_edge(g, tweeter_id, ht, tweet_id, tweet_ts, 'HASHTAG')
        for url in urls:
            if not embedded_extended_tweet_url(
                    tweet_id, url
            ):  # extended tweets include a URL to their extended form
                add_edge(g, tweeter_id, url, tweet_id, tweet_ts, 'URL')
        for mentioned_id in mentions:
            observed_user_ids.add(mentioned_id)
            add_edge(g, tweeter_id, mentioned_id, tweet_id, tweet_ts,
                     'MENTION')

        if 'retweeted_status' in tweet:
            retweeter = tweeter_id
            retweetee = tweet['retweeted_status']['user']['id_str']
            observed_user_ids.add(retweetee)
            add_edge(
                g,
                retweeter,
                retweetee,
                tweet_id,
                tweet_ts,
                'REPOST',
                original_tweet_id=tweet['retweeted_status']['id_str'],
                original_tweet_ts=tweet['retweeted_status']['created_at'],
                posting_delay_sec=(
                    utils.extract_ts_s(tweet['retweeted_status']['created_at'])
                    - utils.extract_ts_s(tweet_ts))  #.total_seconds()
            )
        elif 'quoted_status' in tweet and 'retweeted_status' not in tweet:
            quoter = tweeter_id
            quotee = tweet['quoted_status']['user']['id_str']
            observed_user_ids.add(quotee)
            add_edge(
                g,
                quoter,
                quotee,
                tweet_id,
                tweet_ts,
                'QUOTE',
                original_tweet_id=tweet['quoted_status']['id_str'],
                original_tweet_ts=tweet['quoted_status']['created_at'],
                posting_delay_sec=(
                    utils.extract_ts_s(tweet['quoted_status']['created_at']) -
                    utils.extract_ts_s(tweet_ts))  #.total_seconds()
            )
        elif 'in_reply_to_status_id_str' in tweet and tweet[
                'in_reply_to_status_id_str'] in tweets:
            # only consider replies that appear in the corpus
            # basic reply info
            replier = tweeter_id
            replied_to = tweet['in_reply_to_user_id_str']
            observed_user_ids.add(replied_to)

            replied_to_status = tweets[tweet['in_reply_to_status_id_str']]
            replied_to_status_ts = replied_to_status['created_at']
            posting_delay_sec = (utils.extract_ts_s(replied_to_status_ts) -
                                 utils.extract_ts_s(tweet_ts)
                                 )  #.total_seconds()
            add_edge(g,
                     replier,
                     replied_to,
                     tweet_id,
                     tweet_ts,
                     'REPLY',
                     original_tweet_id=tweet['in_reply_to_status_id_str'],
                     original_tweet_ts=replied_to_status_ts,
                     posting_delay_sec=posting_delay_sec)
            # in conversation
            if tweet['in_reply_to_status_id_str'] in tweets:
                # follow the reply chain as far as we can
                conversation_root = root_of_conversation(
                    tweet['in_reply_to_status_id_str'], tweets)
                # conversation_root MAY NOT be in the corpus - it's still a link though
                conv_root_ts = first_tweet_ts_str
                posting_delay_sec = (utils.ts_2_epoch_seconds(first_tweet_ts) -
                                     utils.extract_ts_s(tweet_ts)
                                     )  #.total_seconds()
                if conversation_root in tweets:
                    observed_user_ids.add(
                        tweets[conversation_root]['user']['id_str'])
                    conv_root_ts = tweets[conversation_root]['created_at']
                    posting_delay_sec = (utils.extract_ts_s(conv_root_ts) -
                                         utils.extract_ts_s(tweet_ts)
                                         )  #.total_seconds()
                add_edge(g,
                         replier,
                         conversation_root,
                         tweet_id,
                         tweet_ts,
                         'IN_CONVERSATION',
                         original_tweet_id=conversation_root,
                         original_tweet_ts=conv_root_ts,
                         posting_delay_sec=posting_delay_sec)
    return g
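
Example #26 returns a networkx MultiDiGraph whose edges carry an 'interaction' attribute. A brief usage sketch (the function name below is hypothetical; tweets and t_0 are the tweet map and epoch time of the earliest tweet, as in the signature above):

from collections import Counter

def summarise_interactions(tweets, t_0):
    # Count how many edges each interaction type contributes to the activity graph.
    g = build_activity_graph(tweets, t_0)
    return Counter(data['interaction'] for _, _, data in g.edges(data=True))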
Example #27
 def setUp(self):
     self.page = MagicMock()
     self.insert = Insert(MagicMock())
     self.text = extract_text('/src/tests/page/Paris.txt')
     self.langs = ['fr', 'nn', 'pt', 'ro', 'se', 'tl', 'tr', 'vi']
Example #28
 def setUp(self):
     self.text = extract_text('/src/tests/page/gå_before.txt')
Example #29
 def test_successfully_sorting(self):
     text_result = extract_text('/src/tests/page/Paris_sorted.txt')
     assert self.insert.category_sorting(
         self.page, self.text, self.langs) == text_result
Example #30
 def test_extract_text(self):
     bucket = "cloud-samples-data" 
     filename = "vision/text/screen.jpg"
     text = utils.extract_text(bucket, filename)
     self.assertIsNotNone(text)
     self.assertIsInstance(text, str)
Example #31
def extract_report(report_path):
  return utils.extract_text(report_path)