def main():
    """Poll the FACET news feed and show a desktop notification when a
    new post has appeared since the last recorded check.

    Side effects: reads/writes a state file named "last" in the current
    working directory and shells out to ``notify-send``.
    """
    # Reading raw data from website
    url = "http://www.facet.unt.edu.ar/facetinforma/category/becas/"
    try:
        data = urlopen(url)
        text = data.read().decode('utf-8')
    except Exception:
        # Best-effort poll: on any fetch/decode failure just give up and
        # let the next scheduled run retry.  (Narrowed from a bare
        # ``except:`` that also swallowed KeyboardInterrupt/SystemExit.)
        return

    # Extract the latest entry title from the raw HTML.
    title = text[text.find("entry-title") + 13:]
    title = title[title.find(">") + 1:title.find("</a>")]

    # Getting ISO 8601 format date from the <time datetime="..."> attribute.
    text = text[text.find("datetime"):text.find("</time")]
    text = text[text.find("\"") + 1:text.find("\">")]

    # Datetime object
    date = dateParser(text)

    # Checks last date. If it's the first time, record the current post as
    # the baseline so no notification fires on the first run (matches the
    # original write-then-reparse behaviour, without leaking file handles).
    try:
        with open("last", "r") as f:
            lastdate = dateParser(f.read())
    except FileNotFoundError:
        with open("last", "w") as f:
            f.write(text)
        lastdate = date

    if date > lastdate:
        subprocess.call(['notify-send', "Facet Informa", title])
        with open("last", "w") as f:
            f.write(text)
def getDaterange(input_daterange):
    """Parse a ``"<start> - <end>"`` date-range string into two datetimes.

    Parameters:
        input_daterange: string with start and end separated by " - ";
            each side is anything ``dateParser`` understands.

    Returns:
        (startDate, endDate) tuple of naive local datetimes.

    Raises:
        ValueError: if the string does not split into exactly two parts.
    """
    strStartDate, strEndDate = input_daterange.split(" - ")
    # Round-trip each parsed date through an epoch timestamp, as the
    # original did with strftime('%s').  datetime.timestamp() is the
    # portable equivalent: strftime('%s') is a glibc extension that fails
    # on Windows, and both interpret naive datetimes as local time.
    # (The unused 9-hour compensation constant was dropped.)
    startDate = datetime.fromtimestamp(dateParser(strStartDate).timestamp())
    endDate = datetime.fromtimestamp(dateParser(strEndDate).timestamp())
    return startDate, endDate
def parseArxiv(self, doc_id):
    """Populate this document's fields from arXiv metadata.

    ``doc_id`` doubles as both the PDF file name and the global
    identifier.  Fields arXiv does not provide are set to False.
    """
    meta = self.metadata

    self.title = meta["title"]
    self.title_vector = bc.encode([self.title])[0].tolist()
    self.pdf_file = doc_id
    self.globalID = doc_id
    self.parent_item = False
    self.tags = meta['tags']
    self.url = meta["arxiv_url"]

    # arXiv gives the publication date as a string; keep only the year.
    published = meta["published"]
    self.year = dateParser(published).year if published else False

    # Citation/cluster/notes data is not available from arXiv.
    self.citations = False
    self.versions = False
    self.clusterID = False
    self.citations_list = False
    self.notes = False
    self.type = False
    self.pages = False
    self.citationArticles = False

    # Split "First Middle Last" author strings on spaces: the first token
    # is the first name, everything after it becomes the last name.
    self.authors = [
        {"firstName": name.split(' ')[0],
         "lastName": ' '.join(name.split(' ')[1:])}
        for name in meta["authors"]
    ]

    self.abstract = meta['summary']
    self.organization = meta['affiliation']
def parseMetadata(self):
    """Populate this document's fields from Zotero metadata.

    NOTE(original): each record carries two parent items, notes and tag
    sets; only the "_x" variants are read for now.
    """
    meta = self.metadata

    self.title = meta["name"]
    self.title_vector = bc.encode([self.title])[0].tolist()
    self.pdf_file = meta['pdf_file']
    self.parent_item = meta['parentItem_x']
    self.tags = meta['tags_x']
    self.url = meta["url"]

    # Keep only the year of the (string) publication date, if present.
    date_str = meta["date"]
    self.year = dateParser(date_str).year if date_str else False

    # Fields Zotero does not provide are marked False.
    self.citations = False
    self.versions = False
    self.clusterID = False
    self.citations_list = False
    self.abstract = False
    self.organization = False
    self.pages = False
    self.citationArticles = False

    self.notes = meta["note_x"]
    self.authors = meta["creators"]
    self.type = meta["itemType"]
    self.globalID = meta['key']
def main():
    """Print active NWS alerts for the configured location code ``loc``.

    Fetches the CAP/Atom feed for ``loc``, then prints the event type,
    affected area, and local expiry time of each entry.
    """
    # CAP 1.1 namespace template, so the URN is written once.
    CAP = '{urn:oasis:names:tc:emergency:cap:1.1}%s'

    feed_url = 'http://alerts.weather.gov/cap/wwaatmget.php?x=%s' % (loc)
    doc = objectify.parse(feed_url)
    tree = doc.getroot()
    for e in tree.entry:
        # print() calls (valid in both Python 2 and 3) replace the old
        # Python-2-only print statements, for consistency with the rest
        # of the code base.
        print('What: %s' % e[CAP % 'event'])
        print(' Where: %s' % e[CAP % 'areaDesc'].text)
        expires = dateParser(e[CAP % 'expires'].text)
        print(' Ends: %s' % expires.strftime('%A, %B %d, %Y, %I:%M %p'))
def parseMetadata(self):
    """Populate this document's fields from Zotero metadata and resolve
    author records through the scholar session.

    NOTE(original): each record carries two parent items, notes and tag
    sets; only the "_x" variants are read for now.
    """
    self.title = self.metadata["name"]
    self.pdf_file = self.metadata['pdf_file']
    self.parent_item = self.metadata['parentItem_x']
    self.tags = self.metadata['tags_x']
    self.url = self.metadata["url"]
    self.year = dateParser(self.metadata["date"]).year
    self.citations = False
    self.versions = False
    self.clusterID = False
    self.citations_list = False
    self.notes = self.metadata["note_x"]
    zotero_authors = self.metadata["creators"]
    self.abstract = False
    self.type = self.metadata["itemType"]
    self.globalID = self.metadata['key']
    self.scholarID = False
    # self.conference = self.metadata["publicationTitle"]
    self.organization = False
    self.pages = False
    self.citationArticles = False

    # We create the Authors here too.
    # NOTE(review): pd.DataFrame.append was removed in pandas 2.0 —
    # confirm the project's pandas version before upgrading.
    self.authors = pd.DataFrame()
    for eachAuthor in zotero_authors:
        author = eachAuthor['firstName'] + " " + eachAuthor['lastName']
        if self.session.searchAuthor(author) == False:
            # Author not cached in the session yet: fetch from scholar.
            author_data = scholar.fast_get_author_data(author)
            # Need to change arrays to strings to save as PD
            author_data['Interests'] = str(author_data['Interests'])
            author_data['Paper_Ids'] = [self.globalID]
            author_data_df = pd.DataFrame.from_records(author_data, index=[author_data['Author']])
            self.authors = self.authors.append(author_data_df)
            self.session.addAuthor(author_data_df, self.globalID)
        else:
            print("Author was in Session")
            # Get the author from session
            author_frame = self.session.returnAuthor(eachAuthor)
            # BUG FIX: DataFrame.append returns a new frame; the original
            # discarded the result, so cached authors were never attached
            # to the document.
            self.authors = self.authors.append(author_frame)
def csvToMongo(csvString):
    """Convert a CSV export of multi-engine scan results into MongoDB docs.

    Each CSV row is reshaped into a document with Date / File / Threat /
    Results sections and inserted into ``col_enginediff``.  Returns the
    list of values returned by the inserts.

    NOTE(review): Python 2 code (print statements, dict.has_key); the
    indentation below was reconstructed from a whitespace-mangled source
    and the exact nesting of a few branches should be double-checked.
    """
    Seoul=pytz.timezone('Asia/Seoul')
    data=csvToJson(csvString)
    mongoRetval=[]
    for elem in data:
        # --- PreProcessing Rule: normalise raw CSV string fields ---------
        elem['Size']=int(elem['Size'])
        print elem['Date']
        # The CSV carries only a date; a fixed time-of-day is appended
        # before parsing.  Assumes 09:01:00 is intentional — TODO confirm.
        elem['Date']=elem['Date']+" 09:01:00"
        print elem['Date']
        elem['Date']=dateParser(elem['Date'])
        elem['Severity']=int(elem['Severity'])
        if elem['Threat_Name'].lower()=="none" or elem['Threat_Name']=="":
            elem['Threat_Name']=None
        if elem.has_key("CRC64") is not True:
            elem['CRC64']=None
        try:
            # Drop a leftover unnamed index column, if present.
            elem.pop('Unnamed: 0')
        except:
            pass
        # --- Processing: build the output document sections --------------
        # Timestamps are stored localized to Asia/Seoul.
        Date=Seoul.localize(elem['Date'])
        elem.pop('Date')
        File={
            "Name": elem['FileName'],
            "Type": elem['Type'],
            "MD5" : elem['MD5'],
            "CRC64":elem['CRC64'],
            "Size": elem['Size']
        }
        elem.pop('FileName')
        elem.pop('Type')
        elem.pop('MD5')
        elem.pop('CRC64')
        elem.pop('Size')
        # The source spells this column both "BeaviorCount" (typo) and
        # "BehaviorCount"; accept either, preferring the typoed one.
        behaviorCount=0
        if elem['BeaviorCount']:
            behaviorCount=elem['BeaviorCount']
            elem.pop('BeaviorCount')
        elif elem['BehaviorCount']:
            behaviorCount=elem['BehaviorCount']
            elem.pop('BehaviorCount')
        Threat={
            "Severity":elem['Severity'],
            "Name": elem['Threat_Name'],
            "VM_Severity":elem['Result'],
            "behaviorCount":behaviorCount,
        }
        elem.pop('Severity')
        elem.pop('Result')
        # --- Every remaining key is a per-engine verdict column -----------
        Results={}
        for key, val in elem.items():
            # result(BENIGN|MALICIOUS|SUSPICIOUS) categorization
            if key=='':
                pass
            else:
                # Normalise a known misspelling in the data.
                if val=="MALICOUS":
                    val="MALICIOUS"
                # Any "no detection" spelling maps to BENIGN.
                if val in ['not found', 'Not found', 'None', 'none', 'Clean', 'BENIGN', '', None]:
                    result="BENIGN"
                    reason=val
                else:
                    result=val
                    reason=val
                if key.find("DICA") >= 0:
                    Engine="DICA"
                    EngineVersion=key.replace("DICA_","")
                    Result=result
                    Reason=reason
                elif key.find("VM_Threat_Name") >= 0:
                    Engine="MDP_VM"
                    if val.find("/") >= 0 :
                        EngineVersion=0
                        Reason=reason
                        if reason!="None":
                            Result="MALICIOUS"
                        else:
                            Result="BENIGN"
                            Reason=None
                    else:
                        EngineVersion=0
                        Result="BENIGN"
                        Reason=None
                elif key.find("AhnLab-V3") >= 0 or key.find("Threat_Name")>=0:
                    Engine="V3"
                    EngineVersion="AhnLab-V3"
                    Reason=reason
                    if result!="BENIGN":
                        Result="MALICIOUS"
                    else:
                        Result=result
                elif key.find("Heimdal")>=0:
                    Engine="Heimdal"
                    EngineVersion=key
                    Result=result
                    Reason=reason
                    # Heimdal packs "result/reason" into a single cell.
                    if reason.find("/") >=0:
                        Result=result.split("/")[0]
                        Reason=result.split("/")[1]
                elif key.find("VirusTotal") >= 0:
                    Engine="VirusTotal"
                    # VirusTotal cells look like "positives/total".
                    if val.find("/") >= 0:
                        EngineVersion=int(reason.split("/")[1])
                        Reason=int(reason.split("/")[0])
                        if int(reason.split("/")[0])>0:
                            Result="MALICIOUS"
                        else:
                            Result="BENIGN"
                    else:
                        EngineVersion=0
                        Result="BENIGN"
                        Reason=0
                else:
                    # Unknown column: store it verbatim under its own name.
                    Engine=key
                    EngineVersion=key
                    Result=result
                    Reason=reason
                Results.update({
                    Engine: {
                        "Version":EngineVersion,
                        "Result":Result,
                        "Reason":Reason,
                    }
                })
        retval={
            "Date": Date,
            "File": File,
            "Threat": Threat,
            "Results": Results
        }
        mongoRetval.append(col_enginediff.insert(retval))
    return mongoRetval
headline = keywords body_text = body_text.replace('<</td>', '</td>') body_text = body_text.replace(';', '</td><td>') body_text = body_text.replace('<td>Name Ticker High Low Last Change Change Ratio</td>', '<th>Name</th><th>Ticker</th><th>High</th><th>Low</th><th>Last</th><th>Change</th><th>Change Ratio</th>') time_stamp_string = re.search(r'\d{12}', body_text) if time_stamp_string: stock_info_datetime = datetime.datetime.strptime(time_stamp_string.group(0), '%Y%m%d%H%M') # Adjust time from Eastern time zone. stock_info_datetime = stock_info_datetime - datetime.timedelta(hours=3) # pretty_stock_info_datetime = stock_info_datetime.strftime('%A, %B %d, %Y, %I:%M %p') pretty_stock_info_datetime = date(stock_info_datetime, 'P, N j, Y') body_text = body_text.replace(time_stamp_string.group(0), pretty_stock_info_datetime) APStory_instance = APStory( # category = ap_cat, updated = dateParser(e.updated.text), published = dateParser(e.published.text), management_id = management_id, consumer_ready = consumer_ready, media_type = e['{http://ap.org/schemas/03/2005/apcm}ContentMetadata'].MediaType.text, priority_numeric = e['{http://ap.org/schemas/03/2005/apcm}ContentMetadata'].Priority.attrib['Numeric'], priority_legacy = e['{http://ap.org/schemas/03/2005/apcm}ContentMetadata'].Priority.attrib['Legacy'], subject_code = ap_subject_code, location = location, contributor = contributor, contributor_uri = contributor_uri, byline = byline, byline_title = byline_title, slugline = e['{http://ap.org/schemas/03/2005/apcm}ContentMetadata'].SlugLine.text, title = e.title.text, keywords = keywords,
def _createGameFromTag(self, game_tag, base_url, rootElement):
    """Build a Game Element from a TheGamesDB ``<Game>`` XML tag.

    Returns the saved Element, or None when the tag lacks the minimum
    fields (title, id, platform) or no matching platform exists under
    ``rootElement``.
    """
    titleTag = game_tag.find('GameTitle')
    idTag = game_tag.find('id')
    platformTag = game_tag.find('Platform')
    platformIDTag = game_tag.find('PlatformId')
    imagesTag = game_tag.find('Images')
    genresTag = game_tag.find('Genres')
    release_date = game_tag.find('ReleaseDate')
    trailer = game_tag.find('Youtube')

    if titleTag is None or idTag is None or platformTag is None or platformIDTag is None:
        log("Not enough info to create game")
        return None

    g = Element()
    g.type = 'Game'
    g.mediaType = MediaType.get(MediaType.identifier == 'de.lad1337.games')
    g.setField('id', int(idTag.text), self.tag)
    g.setField('name', titleTag.text, self.tag)
    g.setField(
        'front_image',
        self._boxartUrl(imagesTag, platformIDTag.text, base_url, 'front'),
        self.tag)
    g.setField('fanart_image',
               self._fanartUrl(imagesTag, base_url, 'original'), self.tag)
    g.setField('genre', self._genresStr(genresTag), self.tag)

    # Release date: try the site's fixed MM/DD/YYYY format first, then a
    # lenient dateParser parse; fall back to "now" when both fail.
    if release_date is not None:
        try:
            g.setField(
                'release_date',
                datetime.datetime.strptime(release_date.text, "%m/%d/%Y"),
                self.tag)
        except ValueError:
            # (Removed a redundant re-check of release_date here; it is
            # known to be non-None in this branch.)
            ddd = dateParser(release_date.text)
            if ddd is not None and hasattr(ddd, 'year') and hasattr(
                    ddd, 'month') and hasattr(ddd, 'day'):
                g.setField('release_date',
                           datetime.datetime(ddd.year, ddd.month, ddd.day),
                           self.tag)
            else:
                g.setField('release_date', datetime.datetime.now(), self.tag)
    else:
        g.setField('release_date', datetime.datetime.now(), self.tag)

    if trailer is not None:
        # The games db uses full YouTube URLs; keep only the video id.
        # http://stackoverflow.com/questions/2639582/python-small-regex-problem
        yid = re.search(r'(?<=\?v\=)[\w-]+', trailer.text)
        # BUG FIX: re.search returns None for malformed URLs; the original
        # crashed on yid.group(0) in that case.
        g.setField('trailer', yid.group(0) if yid else '', self.tag)
    else:
        g.setField('trailer', '', self.tag)

    # Resolve (and cache) the platform Element this game belongs to.
    platform_id = int(platformIDTag.text)
    if platform_id not in self._pCache:
        q = Element.select().where(
            Element.mediaType == rootElement.mediaType,
            Element.type == 'Platform')
        for e in q:
            if e.getField('id', self.tag) == platform_id:
                platform = e.copy()
                platform.parent = rootElement
                platform.saveTemp()
                self._pCache[platform_id] = platform
                g.parent = platform
                g.saveTemp()
                self.progress.addItem()
                break
        else:
            # No platform with this id is known under rootElement.
            return None
    else:
        g.parent = self._pCache[platform_id]
        g.saveTemp()
        self.progress.addItem()
    return g
def _createGameFromTag(self, game_tag, base_url, rootElement):
    """Build a Game Element from a TheGamesDB ``<Game>`` XML tag.

    Returns the saved Element, or None when the tag lacks the minimum
    fields (title, id, platform) or no matching platform exists under
    ``rootElement``.
    """
    titleTag = game_tag.find('GameTitle')
    idTag = game_tag.find('id')
    platformTag = game_tag.find('Platform')
    platformIDTag = game_tag.find('PlatformId')
    imagesTag = game_tag.find('Images')
    genresTag = game_tag.find('Genres')
    release_date = game_tag.find('ReleaseDate')
    trailer = game_tag.find('Youtube')

    if titleTag is None or idTag is None or platformTag is None or platformIDTag is None:
        log("Not enough info to create game")
        return None

    g = Element()
    g.type = 'Game'
    g.mediaType = MediaType.get(MediaType.identifier == 'de.lad1337.games')
    g.setField('id', int(idTag.text), self.tag)
    g.setField('name', titleTag.text, self.tag)
    g.setField('front_image', self._boxartUrl(imagesTag, platformIDTag.text, base_url, 'front'), self.tag)
    g.setField('fanart_image', self._fanartUrl(imagesTag, base_url, 'original'), self.tag)
    g.setField('genre', self._genresStr(genresTag), self.tag)

    # Release date: try the site's fixed MM/DD/YYYY format first, then a
    # lenient dateParser parse; fall back to "now" when both fail.
    if release_date is not None:
        try:
            g.setField('release_date', datetime.datetime.strptime(release_date.text, "%m/%d/%Y"), self.tag)
        except ValueError:
            # (Removed a redundant re-check of release_date here; it is
            # known to be non-None in this branch.)
            ddd = dateParser(release_date.text)
            if ddd is not None and hasattr(ddd, 'year') and hasattr(ddd, 'month') and hasattr(ddd, 'day'):
                g.setField('release_date', datetime.datetime(ddd.year, ddd.month, ddd.day), self.tag)
            else:
                g.setField('release_date', datetime.datetime.now(), self.tag)
    else:
        g.setField('release_date', datetime.datetime.now(), self.tag)

    if trailer is not None:
        # The games db uses full YouTube URLs; keep only the video id.
        # http://stackoverflow.com/questions/2639582/python-small-regex-problem
        yid = re.search(r'(?<=\?v\=)[\w-]+', trailer.text)
        # BUG FIX: re.search returns None for malformed URLs; the original
        # crashed on yid.group(0) in that case.
        g.setField('trailer', yid.group(0) if yid else '', self.tag)
    else:
        g.setField('trailer', '', self.tag)

    # Resolve (and cache) the platform Element this game belongs to.
    platform_id = int(platformIDTag.text)
    if platform_id not in self._pCache:
        q = Element.select().where(Element.mediaType == rootElement.mediaType, Element.type == 'Platform')
        for e in q:
            if e.getField('id', self.tag) == platform_id:
                platform = e.copy()
                platform.parent = rootElement
                platform.saveTemp()
                self._pCache[platform_id] = platform
                g.parent = platform
                g.saveTemp()
                self.progress.addItem()
                break
        else:
            # No platform with this id is known under rootElement.
            return None
    else:
        g.parent = self._pCache[platform_id]
        g.saveTemp()
        self.progress.addItem()
    return g
def _createGameFromTag(self, game_tag, base_url, rootElement):
    """Build a Game Element from a TheGamesDB ``<Game>`` XML tag.

    Returns the saved Element, or None when the tag lacks the minimum
    fields (title, id, platform) or no matching platform exists under
    ``rootElement``.
    """
    titleTag = game_tag.find("GameTitle")
    idTag = game_tag.find("id")
    platformTag = game_tag.find("Platform")
    platformIDTag = game_tag.find("PlatformId")
    imagesTag = game_tag.find("Images")
    genresTag = game_tag.find("Genres")
    release_date = game_tag.find("ReleaseDate")
    trailer = game_tag.find("Youtube")

    if titleTag is None or idTag is None or platformTag is None or platformIDTag is None:
        log("Not enough info to create game")
        return None

    g = Element()
    g.type = "Game"
    g.mediaType = MediaType.get(MediaType.identifier == "de.lad1337.games")
    g.setField("id", int(idTag.text), self.tag)
    g.setField("name", titleTag.text, self.tag)
    g.setField("front_image", self._boxartUrl(imagesTag, platformIDTag.text, base_url, "front"), self.tag)
    g.setField("fanart_image", self._fanartUrl(imagesTag, base_url, "original"), self.tag)
    g.setField("genre", self._genresStr(genresTag), self.tag)

    # Release date: try the site's fixed MM/DD/YYYY format first, then a
    # lenient dateParser parse; fall back to "now" when both fail.
    if release_date is not None:
        try:
            g.setField("release_date", datetime.datetime.strptime(release_date.text, "%m/%d/%Y"), self.tag)
        except ValueError:
            # (Removed a redundant re-check of release_date here; it is
            # known to be non-None in this branch.)
            ddd = dateParser(release_date.text)
            if ddd is not None and hasattr(ddd, "year") and hasattr(ddd, "month") and hasattr(ddd, "day"):
                g.setField("release_date", datetime.datetime(ddd.year, ddd.month, ddd.day), self.tag)
            else:
                g.setField("release_date", datetime.datetime.now(), self.tag)
    else:
        g.setField("release_date", datetime.datetime.now(), self.tag)

    if trailer is not None:
        # The games db uses full YouTube URLs; keep only the video id.
        # http://stackoverflow.com/questions/2639582/python-small-regex-problem
        yid = re.search(r"(?<=\?v\=)[\w-]+", trailer.text)
        # BUG FIX: re.search returns None for malformed URLs; the original
        # crashed on yid.group(0) in that case.
        g.setField("trailer", yid.group(0) if yid else "", self.tag)
    else:
        g.setField("trailer", "", self.tag)

    # Resolve (and cache) the platform Element this game belongs to.
    platform_id = int(platformIDTag.text)
    if platform_id not in self._pCache:
        q = Element.select().where(Element.mediaType == rootElement.mediaType, Element.type == "Platform")
        for e in q:
            if e.getField("id", self.tag) == platform_id:
                platform = e.copy()
                platform.parent = rootElement
                platform.saveTemp()
                self._pCache[platform_id] = platform
                g.parent = platform
                g.saveTemp()
                self.progress.addItem()
                break
        else:
            # No platform with this id is known under rootElement.
            return None
    else:
        g.parent = self._pCache[platform_id]
        g.saveTemp()
        self.progress.addItem()
    return g