def _parseFile(filepath):
    #this should only happen when we process the first file ever
    if filepath is None:
        return set(), None, None

    data = set()

    info = datafiles.read_info_file(filepath)
    if info['date_last_absent'] is None:
        timestamp = util.convert_date_to_millis(info['date_modified'])
    else:
        timestamp = util.convert_date_to_millis(info['date_first_present'])

    csvfile = open(filepath)
    dialect = csv.Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    reader = csv.DictReader(csvfile, dialect=dialect)
    for row in reader:
        secid = database.getSecidFromCsid(row['GVKEY'], row['IID'], timestamp)
        if secid is None:
            secid = database.createNewCsid(row['GVKEY'], row['IID'], timestamp, None, None, True)
            util.warning("Created new secid: {}.{}=>{}".format(row['GVKEY'], row['IID'], secid))
        data.add((secid, int(row["SPLITDATE"]), float(row["SPLITRATE"])))

    #get the file start date from the filename: split the filepath, take the
    #last token and its first 8 chars (YYYYMMDD)
    startDate = os.path.normpath(filepath).split("/")[-1][0:8]
    startDate = int(startDate)

    return data, startDate, timestamp
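# Sketch: because _parseFile returns a set of (secid, splitdate, splitrate)
# tuples, diffing two consecutive files is plain set arithmetic. The
# previousFile/currentFile arguments are hypothetical; the real caller and its
# file tracking live elsewhere.
def _exampleSplitDeltas(previousFile, currentFile):
    prevData, prevStart, prevTs = _parseFile(previousFile)  # (set(), None, None) on first run
    currData, currStart, currTs = _parseFile(currentFile)
    newRows = currData - prevData      # tuples to insert
    removedRows = prevData - currData  # tuples that vanished from the feed
    return newRows, removedRows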
def __getDeltas(filepath, source, treatFutureData=False):
    ######get the previously processed file##########
    localDir = config.load_source_config(source)["local_dir"]
    previousData = set()
    previousFileName = database.getLastProcessedFile(source)
    if previousFileName is not None:
        previousFileName = os.environ["DATA_DIR"] + "/" + localDir + "/" + previousFileName
        previousFileInfo = datafiles.read_info_file(previousFileName)
        previousFileDate = previousFileInfo["date_first_present"]
        firstLoad = False

        zf = zipfile.ZipFile(previousFileName)
        names = zf.namelist()
        assert len(names) == 1
        file = zf.open(names[0])
        #skip header
        file.readline()
        for line in file:
            if treatFutureData:
                #only count lines whose effective date predates the previous file
                effectiveDate = line.strip().split("|")[4]
                effectiveDate = dateutil.parser.parse(effectiveDate + " 00:00:00.000000 UTC")
                if effectiveDate < previousFileDate:
                    previousData.add(line)
            else:
                previousData.add(line)
        file.close()
        zf.close()
    else:
        firstLoad = True

    ##########get deltas from previous file#############
    currentData = set()
    zf = zipfile.ZipFile(filepath)
    names = zf.namelist()
    assert len(names) == 1
    file = zf.open(names[0])
    #skip header
    file.readline()
    for line in file:
        currentData.add(line)
    file.close()
    zf.close()

    newData = currentData - previousData
    removedData = previousData - currentData

    return (newData, removedData, firstLoad)
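# Sketch: a typical consumer of __getDeltas inserts the new lines and retires
# the removed ones. __insertLine/__deleteLine are hypothetical stand-ins for
# the real row handlers.
def _exampleApplyDeltas(filepath, source):
    newData, removedData, firstLoad = __getDeltas(filepath, source)
    backfill = 1 if firstLoad else 0
    for line in newData:
        __insertLine(line, backfill)  # hypothetical helper
    for line in removedData:
        __deleteLine(line)            # hypothetical helper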
def process(filepath, source):
    info = datafiles.read_info_file(filepath)
    if info["date_last_absent"] is not None:
        backfill = 0
        timestamp = util.convert_date_to_millis(info["date_first_present"])
    else:
        backfill = 1
        timestamp = util.convert_date_to_millis(info["date_modified"])

    database.setAttributeAutoCreate(True)

    bad = 0
    data = util.csvdict(open(filepath))
    for row in data:
        ticker = row["Symbol"]
        secid = database.getSecidFromXref("TIC", ticker, timestamp, "compustat_idhist", newdb.xrefsolve.preferUS)
        if secid is None:
            continue
        try:
            date = util.convert_date_to_millis(row["Record_Date"])
        except:
            util.warning("Bad date for row: " + str(row))
            bad += 1
            if bad > 20:
                util.error(str(bad) + " bad lines found. Raising exception. Go check file " + filepath)
                raise Exception(str(bad) + " bad lines found. Raising exception. Go check file " + filepath)
            continue  #skip rows whose date failed to parse

        for sqAtt, ourAtt in attributeMap.iteritems():
            name = ourAtt[0]
            compareWithRecent = ourAtt[1]
            value = row[sqAtt]
            if value == '':
                value = None
            database.insertAttribute("sec", "n", secid, date, source, name, value, timestamp, None, backfill, False, compareWithRecent, approximatelyEqual)
def process(filepath, source):
    date = os.path.basename(filepath).split('.')[2]
    born = date + " 09:30 EST"
    date_millis = util.convert_date_to_millis(date)
    born_millis = util.convert_date_to_millis(born)

    # If we have acquisition times, use these for the real born_millis time
    info = datafiles.read_info_file(filepath)
    if info['date_last_absent'] is not None:
        born_millis = util.convert_date_to_millis(info['date_first_present'])
        backfill = 0
    else:
        born_millis = util.convert_date_to_millis(date + " 09:30 EST")
        backfill = 1

    database.setAttributeAutoCreate(True)

    for line in file(filepath):
        handle_htb(line, date_millis, born_millis, backfill)
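# Filename convention assumed by the date parsing above (the example name is
# invented, not taken from the source): the date is the third dot-separated
# token of the basename.
def _exampleHtbDate():
    basename = "htb.broker.20100217.txt"  # hypothetical basename
    assert basename.split('.')[2] == "20100217"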
def process(filepath, source):
    sourceNameInDatabase = "onlineinvestor"

    info = datafiles.read_info_file(filepath)
    if "hist" in source:
        backfill = 1
        #timestamp will be data dependent
    else:
        backfill = 0
        timestamp = util.convert_date_to_millis(info["date_modified"])

    database.setAttributeAutoCreate(True)

    with open(filepath, "r") as file:
        for line in file:
            tokens = line.split("\t")
            date = util.convert_date_to_millis(tokens[0])
            ticker = tokens[1]
            notes = tokens[2]

            if backfill == 1:
                born = date
            else:
                born = timestamp

            secid = database.getSecidFromXref("TIC", ticker, date, "compustat", newdb.xrefsolve.preferUS)
            if secid is None:
                util.warning("Failed to map ticker {},{}".format(ticker, tokens[0]))
                return

            coid, issueid = database.getCsidFromSecid(secid)
            assert coid is not None

            database.insertAttribute("co", "s", coid, date, sourceNameInDatabase, "BUYBACK", notes, born, None, backfill)
# Load set of seen files
util.info("Fetching processed files for %s" % source)
seen = database.getProcessedFiles(source)
util.info("Intersecting...")
for row in listing:
    util.debug("Looking at info: %s" % row[0])
    file_path_info = row[0]
    file_path = os.path.normpath(file_path_info[0:-5])
    #file_path_rel = file_path.replace("%s/%s/" % (os.environ["DATA_DIR"], sconfig['local_dir']), "")
    file_path_rel = os.path.relpath(file_path, "/".join((os.environ["DATA_DIR"], sconfig["local_dir"])))
    if file_path_rel not in seen:
        info = datafiles.read_info_file(file_path)
        # If we don't have reliable acquisition times (first fetch), use modified timestamp
        if info['date_last_absent'] is None:
            date_released = info['date_modified']
        else:
            date_released = info['date_first_present']
        #if we are processing using lag, do not add file
        if options.lag is not None and (util.now() - util.convert_date_to_millis(datetime.timedelta(days=options.lag)) < util.convert_date_to_millis(date_released)):
            continue
        util.info("Found new file: %s" % file_path)
        files.append({
if len(subdirs) == 0:
    errors.append("{}: Never received a file".format(sourceConfigFile[:-3]))
    continue
subdir = subdirs[-1]

acquireTimestamp = 0L
for node in os.walk(sourceLocalDir + "/" + subdir):
    dir = node[0]
    files = node[2]
    for file in files:
        if ".info" in file or ".time" in file or ".new" in file:
            continue
        info = datafiles.read_info_file(dir + "/" + file)
        timestamp = util.convert_date_to_millis(info["date_first_present"])
        if timestamp > acquireTimestamp:
            acquireTimestamp = timestamp

now = util.now()
checkTimestamp = util.convert_date_to_millis(cPickle.load(open(timeFile, 'rb')))

#get the frequency with which we expect new data
expectedNewDataFrequency = sc.get("new_data_frequency", defaultNewDataFrequency)

checkHours = (now - checkTimestamp) / (60 * 60 * 1000)
checkMins = ((now - checkTimestamp) % (60 * 60 * 1000)) / (60 * 1000)
acquireHours = (now - acquireTimestamp) / (60 * 60 * 1000)
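# Sanity check of the elapsed-time arithmetic above: hours and minutes are
# recovered from a millisecond delta via integer division (Python 2 semantics).
def _exampleElapsedArithmetic():
    delta = 5 * 60 * 60 * 1000 + 42 * 60 * 1000      # 5h 42m in millis
    hours = delta / (60 * 60 * 1000)                 # -> 5
    mins = (delta % (60 * 60 * 1000)) / (60 * 1000)  # -> 42
    assert (hours, mins) == (5, 42)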
def process(filePath, source, verifyOnly=False):
    #process only the RSK files for now
    if filePath.find(".RSK.") < 0:
        return

    file = open(filePath, "r")

    #The first 2 lines should be the pricedate and the modeldate
    tokens = file.readline().strip().split(":")
    if tokens[0] != "PriceDate":
        util.error("It doesn't seem like a barra daily format")
        raise Exception
    else:
        priceDate = __barraDateToCompact(tokens[1].strip())

    tokens = file.readline().strip().split(":")
    if tokens[0] != "ModelDate":
        util.error("It doesn't seem like a barra daily format")
        raise Exception
    else:
        modelDate = __barraDateToCompact(tokens[1].strip())

    # If we have acquisition times, use these for the real born time.
    # Else, use the priceDate + 1 day.
    fileInfo = datafiles.read_info_file(filePath)
    if fileInfo['date_last_absent'] is not None:
        timestamp = util.convert_date_to_millis(fileInfo['date_first_present'])
        backfill = 0
        database.setAttributeAutoCreate(True)
    else:
        date = priceDate + datetime.timedelta(days=1)
        timestamp = util.convert_date_to_millis(date.strftime("%Y%m%d"))
        backfill = 1
        database.setAttributeAutoCreate(True)

    #get the header names. comma separated, surrounded by double quotes
    line = file.readline()
    headers = __getListFromBarraLine(line)

    #init the database
    #database.dropXrefCache()
    #database.addXrefCache(timestamp) #cache xrefs

    #######MAPPING VERIFICATION CODE########
    inconsistentMappings = []
    ########################################

    for line in file:
        data = __getListFromBarraLine(line)
        if len(data) != len(headers):
            util.warning("Skipping bad line: {}".format(line))
            continue
        data = dict(zip(headers, data))

        #######MAPPING VERIFICATION CODE########
        if verifyOnly:
            result = __verifyMapping(data["BARRID"], util.cusip8to9(data["CUSIP"]), data["TICKER"], source, timestamp, newdb.xrefsolve.preferUS)  #mirror the getSecid call
            if result is not None:
                inconsistentMappings.append(result)
            continue
        ########################################

        secid = __getSecId(data["BARRID"], util.cusip8to9(data["CUSIP"]), data["TICKER"], source, timestamp, newdb.xrefsolve.preferUS, filePath)
        if secid is None:
            continue

        #Now, insert barra attributes and attribute values
        __removeUnwantedAttributes(data)
        for attributeName, attributeValue in data.iteritems():
            if isinstance(attributeValue, str):
                table = "s"
            elif isinstance(attributeValue, int):
                table = "n"
            elif isinstance(attributeValue, float):
                table = "n"
            else:
                util.error("Attribute values should be either int, float or str")
                raise Exception("Attribute values should be either int, float or str")

            #assert attributeName.startswith("INDNAME") and table=="s"

            #With the exception of capitalization and price, the other barra attributes
            #are evaluated monthly; for them, the date should be the model date. Price
            #we ignore, while for capitalization we only create a new tuple if it has
            #changed more than a threshold since the last date for which we have a tuple.
            if attributeName == "PRICE":
                continue
            elif attributeName == "CAPITALIZATION":
                database.insertAttribute("sec", "n", secid, util.convert_date_to_millis(priceDate), source, attributeName, attributeValue, timestamp, None, backfill, False, True, __capEquals)
            else:
                database.insertAttribute("sec", table, secid, util.convert_date_to_millis(modelDate), source, attributeName, attributeValue, timestamp, None, backfill)

    file.close()

    #######MAPPING VERIFICATION CODE########
    if verifyOnly:
        return inconsistentMappings
def process(filepath, source):
    #if full
    if "full" in source:
        #timestamp=util.convert_date_to_millis("18000101")
        fileInfo = datafiles.read_info_file(filepath)
        timestamp = util.convert_date_to_millis(fileInfo['date_first_present'])
        backfill = 1
        database.setAttributeAutoCreate(True)
        optimize = False
    else:
        fileInfo = datafiles.read_info_file(filepath)
        timestamp = util.convert_date_to_millis(fileInfo['date_first_present'])
        backfill = 0
        database.setAttributeAutoCreate(False)
        optimize = True

    database.setAttributeAutoCreate(True)
    database.setCurrencyAutoCreate(True)

    #open the zipped file
    zf = zipfile.ZipFile(filepath)
    names = zf.namelist()
    assert len(names) == 1
    file = zf.open(names[0])

    #variables that persist through the loop; presented here for clarity only
    table = None
    keyNames = None
    attributeNames = None
    numOfKeys = None

    if optimize:
        parsedLines = __optimize(file)

    #filter secids and coids to be processed
    if "_g" in source or "global" in source:
        __getGlobalCoids()
        processSecid = __globalSecidFilter
        processCoid = __globalCompanyFilter
    else:
        processSecid = __localSecidFilter
        processCoid = __localCompanyFilter

    #process lines
    counter = 0
    while True:
        if optimize:
            if len(parsedLines) == 0:
                break
            line = parsedLines.pop(0)
            if len(line) == 3:
                (command, keyValues, attributeValues) = line
            elif len(line) == 4:
                (table, numOfKeys, keyNames, attributeNames) = line
                continue
            else:
                continue
        else:
            line = __getSplitLine(file)
            if line is None:
                break
            if line[0] in ("T", "F", "E"):
                continue
            elif line[0] == "H":
                (table, numOfKeys, keyNames, attributeNames) = __parseHeaderLine(line)
                continue
            elif line[0] in ("I", "C", "D", "R"):
                (command, keyValues, attributeValues) = __parseDataLine(line, numOfKeys)
            else:
                util.error("Oh no! a K command: {}".format(line))
                continue

        #progress
        counter = counter + 1
        if counter % 10000 == 0:
            util.info("{}: Processing line {}k".format(datetime.datetime.now(), counter / 1000))

        #remove keys that are replicated in attributes
        keys = {}
        keys.update(zip(keyNames, keyValues))
        attributes = {}
        if command in ("I", "C"):
            for n, v in zip(attributeNames, attributeValues):
                if n not in keys and v != "":
                    attributes[n] = v
        elif command == "D":
            for n, v in zip(attributeNames, attributeValues):
                if n not in keys and v == " ":
                    attributes[n] = None
        elif command == "R":
            for n, v in zip(attributeNames, attributeValues):
                if n not in keys:
                    attributes[n] = None

        if table == "security":
            __processSecurity(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
        elif table == "sec_dprc":
            __processPrice(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
            __processCSHOC(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
        elif table == "company":
            __processCompany(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
        elif table == "sec_divid":
            __processDividend(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
        elif table == "sec_split":
            __processSplit(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
        elif table == "co_industry":
            __processIndustry(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
        elif table == "co_hgic":
            __processHgic(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
        elif table in ("co_afnd1", "co_afnd2", "co_ifndq", "co_ifndsa", "co_ifndytd"):
            __processFundamental(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
        elif table in ("co_idesind", "co_adesind"):
            __processDesind(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
        elif table in ("co_amkt", "co_imkt"):
            __processMkt(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
        elif table == "co_filedate":
            __processFiledate(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
        elif table == "adsprate":
            __processCredit(command, keys, attributes, timestamp, source, backfill, processCoid, processSecid)
        elif table == "exrt_dly":
            __processExchange(command, keys, attributes, timestamp, source, backfill)
        else:
            continue

    #__processBufferedFundamentals(source, backfill, buffer)

    file.close()
    zf.close()
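# Sketch: the elif chain above expressed as a table-name -> handler dispatch
# dict. Only handlers sharing the full signature are listed; sec_dprc (which
# fans out to two handlers) and exrt_dly (fewer arguments) would stay
# special-cased.
def _exampleDispatch(table, command, keys, attributes, timestamp, source,
                     backfill, processCoid, processSecid):
    handlers = {
        "security": __processSecurity,
        "company": __processCompany,
        "sec_divid": __processDividend,
        "sec_split": __processSplit,
        "co_industry": __processIndustry,
        "co_hgic": __processHgic,
        "co_filedate": __processFiledate,
        "adsprate": __processCredit,
    }
    handler = handlers.get(table)
    if handler is not None:
        handler(command, keys, attributes, timestamp, source, backfill,
                processCoid, processSecid)
    return handler is not None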
def _parseFile(filepath):
    #this should only happen when we process the first file ever
    if filepath is None:
        return {}, None, None, None

    info = datafiles.read_info_file(filepath)
    if os.path.basename(filepath).startswith("yearn_archive.txt"):
        backfill = 1
        archive = True
    elif info['date_last_absent'] is None:
        timestamp = util.convert_date_to_millis(info['date_modified'])
        backfill = 1
        archive = False
    else:
        timestamp = util.convert_date_to_millis(info['date_first_present'])
        backfill = 0
        archive = False

    file = open(filepath, "r")
    data = {}
    for line in file:
        line = line.rstrip("\n")

        # Parse date
        # XXX all dates need to be in UTC based on exchange of stock
        annDate, name, ticker, value, time = line.split("\t")
        if time == 'Time Not Supplied':
            exactAnnDate = annDate + ' 00:00 UTC'
        elif time == 'Before Market Open':
            exactAnnDate = annDate + ' 08:00 EST'
        elif time == 'After Market Close':
            exactAnnDate = annDate + ' 17:00 EST'
        else:
            exactAnnDate = annDate + " " + time.replace("ET", "EST")

        #annDate to millis
        try:
            exactAnnDate = util.convert_date_to_millis(exactAnnDate)
        except:
            util.warning("Failed to parse {}".format(exactAnnDate))
            print "Failed to parse {}".format(exactAnnDate)
            continue

        if archive:
            timestamp = util.convert_date_to_millis(annDate) - util.convert_date_to_millis(datetime.timedelta(days=30))

        secid = database.getSecidFromXref("TIC", ticker, timestamp, "compustat_idhist", newdb.xrefsolve.preferUS)
        if secid is None:
            util.warning("Failed to map ticker {}".format(ticker))
            continue

        coid, issueid = database.getCsidFromSecid(secid)
        assert coid is not None

        data[(coid, exactAnnDate, backfill)] = annDate
        #data.append((coid, exactAnnDate, backfill, timestamp))

    file.close()

    #get the file start date from the filename: split the filepath, take the
    #last token and its first 8 chars
    if not archive:
        startDate = os.path.normpath(filepath).split("/")[-1][0:8]
    else:
        startDate = "20060101"

    return (data, archive, startDate, timestamp)
def __processPush(filepath, source):
    #get the date the data are about
    date = util.convert_date_to_millis(filepath[-21:-13])

    fileInfo = datafiles.read_info_file(filepath)
    if fileInfo["date_last_absent"] is None:
        backfill = 1
        timestamp = date
    else:
        backfill = 0
        timestamp = util.convert_date_to_millis(fileInfo["date_first_present"])

    file = open(filepath, "r")

    #make a first pass and collect the data
    data = []
    counter = Counter()
    for line in file:
        tokens = line.strip().split(",")
        type = tokens[4]
        if type != 'R':
            util.error("Strange line in availability push file " + line)
            continue
        cusip = tokens[1]
        quantity = float(tokens[2])
        rate = float(tokens[3])
        data.append((cusip, quantity, rate))
        counter[rate] += 1

    #get the mode (most frequent) of the rates
    rateModes = counter.most_common(2)
    #assert that the most frequent rate is the larger of the two,
    #i.e., the mode corresponds to the "base" borrow rate
    assert rateModes[0][0] > rateModes[1][0]
    rateMode = rateModes[0][0]

    #insert the data
    failure = 0
    for datum in data:
        cusip = datum[0]
        quantity = datum[1]
        rateDiff = datum[2] - rateMode

        secid = database.getSecidFromXref("CUSIP", cusip, timestamp, "compustat_idhist", newdb.xrefsolve.preferUS)
        if secid is None:
            failure += 1
            util.warning("Failed to map CUSIP {}. Failure #{}".format(cusip, failure))
            continue

        if rateDiff > 0:
            util.error("Positive rate for {}: Rate={}, Mode={}, Diff={}".format(cusip, datum[2], rateMode, rateDiff))
        elif rateDiff == 0:
            pass
        else:
            database.insertAttribute("sec", "n", secid, date, source, "BORROW_RATE_PUSHED", rateDiff, timestamp, None, backfill, False, False, util.dict_fields_eq_num_stable)

        database.insertAttribute("sec", "n", secid, date, source, "BORROW_AVAILABILITY", quantity, timestamp, None, backfill, False, False, util.dict_fields_eq_num_stable)

    file.close()
def __processRequest(filepath, source):
    #get the date the data are about
    date = util.convert_date_to_millis(filepath[-21:-13])

    fileInfo = datafiles.read_info_file(filepath)
    if fileInfo["date_last_absent"] is None:
        backfill = 1
        timestamp = date
    else:
        backfill = 0
        timestamp = util.convert_date_to_millis(fileInfo["date_first_present"])

    file = open(filepath, "r")

    #make a first pass and collect the data
    data = []
    counter = Counter()
    for line in file:
        tokens = line.strip().split(",")
        ticker = tokens[0]
        requested = float(tokens[2])
        allocated = float(tokens[3])
        #notes=tokens[4]
        if len(tokens) > 5:
            rate = float(tokens[5])
            type = tokens[6]
        else:
            rate = None
            type = None
        assert type is None or type == "R"
        data.append((ticker, requested, allocated, rate))
        counter[rate] += 1

    #get the mode (most frequent) of the rates
    rateModes = counter.most_common(2)
    #assert that the most frequent rate is the larger of the two,
    #i.e., the mode corresponds to the "base" borrow rate
    assert rateModes[0][0] > rateModes[1][0]
    rateMode = rateModes[0][0]

    #insert the data
    failure = 0
    for datum in data:
        ticker = datum[0]
        requested = datum[1]
        allocated = datum[2]
        rateDiff = datum[3] - rateMode if datum[3] is not None else None

        secid = database.getSecidFromXref("TIC", ticker, timestamp, "compustat_idhist", newdb.xrefsolve.preferUS)
        if secid is None:
            failure += 1
            util.warning("Failed to map TICKER {}. Failure #{}".format(ticker, failure))
            continue

        if rateDiff > 0:
            util.error("Positive rate for {}: Rate={}, Mode={}, Diff={}".format(ticker, datum[2], rateMode, rateDiff))
        elif rateDiff == 0:
            pass
        else:
            database.insertAttribute("sec", "n", secid, date, source, "BORROW_RATE", rateDiff, timestamp, None, backfill, False, False, util.dict_fields_eq_num_stable)

        database.insertAttribute("sec", "n", secid, date, source, "BORROW_REQUESTED", requested, timestamp, None, backfill, False, False, util.dict_fields_eq_num_stable)
        database.insertAttribute("sec", "n", secid, date, source, "BORROW_ALLOCATED", allocated, timestamp, None, backfill, False, False, util.dict_fields_eq_num_stable)

    file.close()
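# Worked example of the rate-mode logic shared by __processPush and
# __processRequest: the most frequent rate in the file is taken as the "base"
# borrow rate, and only differences from it are stored (numbers invented).
def _exampleRateMode():
    from collections import Counter
    rates = [0.25, 0.25, 0.25, 0.25, -5.0, -1.5, 0.25]
    counter = Counter(rates)
    rateModes = counter.most_common(2)        # [(0.25, 5), (<a special>, 1)]
    assert rateModes[0][0] > rateModes[1][0]  # base rate should exceed the specials
    rateMode = rateModes[0][0]                # 0.25
    diffs = [r - rateMode for r in rates]     # negative => name is special/expensive to borrow
    return rateMode, diffs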
def process(filePath, source, verifyOnly=False):
    #process only the RSK files for now
    if filePath.find(".RSK.") < 0:
        return

    file = open(filePath, "r")

    #The first 2 lines should be the pricedate and the modeldate for daily files.
    #For the monthly files it is just the model date.
    #Check whether it is a daily or a monthly file: does the first line contain PriceDate?
    firstLine = file.readline()
    if "PriceDate" in firstLine:
        daily = True
        file.seek(0)  #get to the first line again

        tokens = file.readline().strip().split(":")
        if tokens[0] != "PriceDate":
            util.error("It doesn't seem like a barra daily format")
            raise Exception
        else:
            priceDate = __barraDateToCompact(tokens[1].strip())

        tokens = file.readline().strip().split(":")
        if tokens[0] != "ModelDate":
            util.error("It doesn't seem like a barra daily format")
            raise Exception
        else:
            modelDate = __barraDateToCompact(tokens[1].strip())
    else:
        daily = False
        file.seek(0)  #get to the first line again

        token = file.readline().strip()
        priceDate = __barraDateToCompact(token)
        modelDate = __barraDateToCompact(token)

    # If we have acquisition times, use these for the real born time.
    # Else, use the priceDate + 1 day (daily) or + 2 days (monthly).
    fileInfo = datafiles.read_info_file(filePath)
    if fileInfo['date_last_absent'] is not None:
        timestamp = util.convert_date_to_millis(fileInfo['date_first_present'])
        backfill = 0
    else:
        if daily:
            date = priceDate + datetime.timedelta(days=1)
        else:
            date = priceDate + datetime.timedelta(days=2)
        timestamp = util.convert_date_to_millis(date.strftime("%Y%m%d"))
        backfill = 1

    database.setAttributeAutoCreate(True)

    priceDate = util.convert_date_to_millis(priceDate)
    modelDate = util.convert_date_to_millis(modelDate)

    #get the header names. comma separated, surrounded by double quotes
    line = file.readline()
    headers = __getListFromBarraLine(line)

    for line in file:
        data = __getListFromBarraLine(line)
        if len(data) != len(headers):
            util.warning("Skipping bad line: {}".format(line))
            continue
        data = dict(zip(headers, data))

        barraid = data["BARRID"]
        cusip = util.cusip8to9(data["CUSIP"])
        #updateBarraRef(barraid, cusip, timestamp, False)
        updateBarraRef(source, barraid, cusip, priceDate, True)

        #Now, insert barra attributes and attribute values
        __removeUnwantedAttributes(data)
        for attributeName, attributeValue in data.iteritems():
            if isinstance(attributeValue, str):
                table = "s"
            elif isinstance(attributeValue, int):
                table = "n"
            elif isinstance(attributeValue, float):
                table = "n"
            else:
                util.error("Attribute values should be either int, float or str")
                raise Exception("Attribute values should be either int, float or str")

            #With the exception of capitalization and price, the other barra attributes
            #are evaluated monthly; for them, the date should be the model date. Price
            #we ignore, while for capitalization we only create a new tuple if it has
            #changed more than a threshold since the last date for which we have a tuple.
            if attributeName == "PRICE":
                continue
            elif attributeName == "CAPITALIZATION":
                insertBarraAttribute("n", barraid, priceDate, source, attributeName, attributeValue, timestamp, backfill, True, __capEquals)
            elif attributeName in ("TICKER", "CUSIP", "NAME"):
                #protect against crappy names
                if attributeName == "NAME":
                    attributeValue = __printableString(attributeValue)
                insertBarraAttribute("s", barraid, priceDate, source, attributeName, attributeValue, timestamp, backfill, True)
            else:
                insertBarraAttribute(table, barraid, modelDate, source, attributeName, attributeValue, timestamp, backfill)

    file.close()
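# Sketch: the str/int/float -> table dispatch above appears in both Barra
# processors; a shared helper like this (hypothetical, not in the module)
# could factor it out.
def _attrTable(value):
    if isinstance(value, str):
        return "s"  # string attribute table
    if isinstance(value, (int, float)):
        return "n"  # numeric attribute table
    raise TypeError("attribute values should be int, float or str: %r" % (value,))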
def process(filepath, source):
    info = datafiles.read_info_file(filepath)
    born_millis = util.convert_date_to_millis(info['date_first_present'])
    #db.insert_checks(next=True, prev=True)
    database.setAttributeAutoCreate(True)

    f = file(filepath, 'r')
    for line in f.readlines():
        line = line.rstrip("\n")

        # Parse story
        story = line.split("|")
        #secs = int(story[0][1:])
        #num = int(story[1])
        time = story[2][0:9]
        text = story[2][10:]
        local_date = info['date_first_present'].astimezone(pytz.timezone('US/Eastern'))
        date = dateutil.parser.parse(str(local_date)[0:11] + time)
        date_millis = util.convert_date_to_millis(date)
        sep = text.find(" - ")
        if sep == -1:
            sep = len(text)
        headline = text[0:sep]
        #body = text[sep+3:]
        category = story[3]
        tickers = story[4].split(";")

        # clean some crap out
        headline = headline.replace("'", "")
        headline = headline.replace("\"", "")
        headline = headline.replace("\xa0", " ")  #non-breaking space

        if category == 'Rec-Upgrade' or category == 'Rec-Downgrade':
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            if category == 'Rec-Upgrade':
                value = 1
            else:
                value = -1
            handle_news(tickers[0], 'FRATING', date_millis, value, born_millis)
            if len(tickers) > 1 and re.match(".+ (not|Not|NOT) .+", headline) is not None:
                handle_news(tickers[1], 'FRATING', date_millis, -1 * value, born_millis)
        elif category == "Rec-Initiate":
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            value = None
            if re.match(".+ (Above Average|Accumulate|Outperform|Buy|Overweight).*", headline):
                value = 1
            elif re.match(".+ (Below Average|Underperform|Sell|Underweight).*", headline):
                value = -1
            elif re.match(".+ (In Line|Perform|Neutral|Hold|Equal Weight).*", headline):
                value = 0
            if value is not None:
                handle_news(tickers[0], 'FRATING', date_millis, value, born_millis)
            else:
                util.warning('unmatched rec initiate')
                util.warning(headline + " " + str(tickers))
        elif category == 'Rumors':
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY1', date_millis, i, born_millis)
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            value = None
            if re.match(".+ (climbs|rallies|jumps|moving higher|raises on|shares rise|movers higher|moves higher|moves off lows|moves up|shares active|ticks up|ticks higher|strength attributed to|up on|trades higher|trades up|spikes higher|moves to positive territory|spikes|begins to move higher|lifts|continues to rise|moves positive).*", headline):
                value = 1
            elif re.match(".+ (weakness attributed to|moves lower|drops on).*", headline):
                value = -1
            if value is not None:
                handle_news(tickers[0], 'FRUMOR', date_millis, value, born_millis)
            else:
                util.warning('unmatched rumor')
                util.warning(headline + " " + str(tickers))
        elif category == 'Hot Stocks':
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY1', date_millis, i, born_millis)
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            value = None
            if re.match(".+ (recieve[sd]?|receive[sd]?|issued) .*?(SEC|warning|subpoena|deficiency|delisting|non-?compliance).*", headline):
                value = -1
            elif re.match(".+ (achieves?|granted|awarded|secures?|renews?|receive[sd]?|granted|expands?|wins?|recieve[sd]?|issues?|issued|presents?|obtains?|announces?|signs?|acquires?|enters?|initiates?|completes?) .*?(rights?|discovery|discovers|awarded|partnerships?|collaborations?|enrollment|agreements?|strategic partner|alliances?|expanded|license|proposals?|permits?|trials?|authorization|availability|certifications?|favorable|data|CE mark|investments?|payments?|extensions?|milestones?|allowances?|accreditations?|(new.*? business)|(oil|gas) reserves|grants?|FDA (priority|approval)|proceeds|royalty|royalties|SPA|([Cc]learance)|waiver|commitments|positive|patents?|contracts?|projects?|deal|orders?|(in (.+)? case)|design|progress|program|assignment|option|approval|settlement|permission|promising|significantly improved|launch|regains|unsolicited offer).*", headline):
                value = 1
            elif re.match(".+ (to raise|raises|increases|to increase|initiates|raising|declares?|delcares?) .*dividend.*", headline):
                value = 1
            elif re.match(".+ (cuts|to cut|to lower|lowers|decreases|suspends|plans to suspend|lowering) .*dividend.*", headline):
                value = -1
            elif re.match(".+ (acquires|raises|acquired) .*stake.*", headline):
                value = 1
            elif re.match(".+ (lowers|liquidn?ates|sell|sells|considering selling|sold) .*stake.*", headline):
                value = -1
            elif re.match(".+ (recall(s?|ing|ed)|discontinu(ing|ed|es?)|lays off|questions efficacy|announces salary reductions|announces (possible )?compromise|lowers guidance|conditions to worsen|sees (.+)?revenue decline|to layoff|not confident|capacity down|(plummets?|sinks?|drops?|moves? lower|falls?|tumbles?|retreats?) (.+)?(after|following|on)|(to reduce|reduced) (distribution|workforce)|reductions (have been|will be) (implemented|likely)|enters into lease termination|loses to|sales (down|trends? worsens?|decreased)|(credit|ratings?) (may get )?(downgraded|lowered)|downgrades|to (cut|eliminate) ((approximately|roughly) )?(%s )?jobs|to stop offering|pullback in demand|curtails production|not in compliance|takes action against|injunction restrains|(Nasdaq|NASDAQ) (notice|notification)|(notice|notification) from (Nasdaq|NASDAQ)|losses|damaged|misses|lawsuit|fraud|halts).*" % number, headline):
                value = -1
            elif re.match("(.+)?(launches new|expects increased demand|raises %s|resum(ed|es?|ing)|licenses? out|licenses technology|delivers|begins delivery|settles? (.+)?litigation|increases (.+)?distribution|raises guidance|approached by potential buyers|removed from CreditWatch|sales up|sales trends (.+)?improve|successfully|could expand|rules in favor for|expects .+ orders|confident|closer to Phase|remains on track|on track to|to manufacture|expects (.+)?to improve|expects strong cash flow|expects production to increase|reports? positive|reports? preliminary data|receives offer|expenses to decline|says .+ now available in|expands? distribution|selected by|selected for|sales increased|will improve|positioned for (.+)?recovery|performance strong|(credit|ratings?) (increased|raised|upgraded)|prepared to weather|continues to increase output|expanding capacity|order (.+)?delivered|(rises?|raises?|gains?|spikes?|advances?|rallies?|soars?|surges?|climbs?|trades? higher) (.+)?(on|following|after)|deploys|to deploy|provides|to provide|extend development|FDA approves|to recognize %s gain|buys %s shares|invests in second phase|shares rise|reaches agreement|sees growth|adds significant production).*" % (number, number, number), headline):
                value = 1
            if value is not None:
                handle_news(tickers[0], 'FHOT', date_millis, value, born_millis)
            else:
                util.warning('unmatched hot stocks')
                util.warning(headline + " " + str(tickers))
        elif category == "Recommendations":
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY1', date_millis, i, born_millis)
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            if headline.find("price target to") != -1:
                m = re.match(".+ price target to (?P<target>%s) from (?P<oldtgt>%s) at .+" % (number, number), headline)
                if m is not None:
                    gd = m.groupdict()
                    target = normalize_number(gd['target'])
                    oldtgt = normalize_number(gd['oldtgt'])
                    if target > oldtgt:
                        value = 1
                    elif target < oldtgt:
                        value = -1
                    else:
                        value = 0
                    handle_news(tickers[0], 'FREC', date_millis, value, born_millis)
            else:
                value = None
                if re.match(".+ (raises price target|is a good deal|underappreciated|likely (to )?be approved|should (grow|move higher)|momentum is continuing|weakness is an overreaction|move is positive|reported (solid|excellent)|pursuing correct|outlook remains positive|will be helped|[cs]hould be better than expected|valuation compelling|has been (very )?positive|should stay strong|are top ideas|checks indicate healthy|should benefit|recommends a long|fundamentals still solid|well-positioned to (outperform|benefit)|shares oversold|should be bought|creates (a )?buying opportunity|a (strong )?buying opportunity|highly attractive|should sell better|problem is fixable|down on misguided|sell-off is overdone|positive news|can achieve|(is|are) strong|outlook (is )?boosted|guidance (is )?(likely )?conservative|should gain|reiterated (Outperform|Buy)|should be owned|poised|be able to overcome|has (good|best) prospects|significantly undervalued|added to Top Picks|remains? undervalued|results bode well|upgraded|valuation (is )?(still )?(remains )?attractive|attractively valued|raise is likely|added to (short[- ]term )?buy list|added to .+ List|added to .+ as a buy|shares defended at|should report (strong|better|stronger|solid)|margins strong|continue to generate (strong )?growth|(shown|shows) signs of improvement|estimates raised|strategy worked|results will likely be solid|named a long|weakness a buying opportunity|risk/reward (ratio )?(is )?(attractive|positive|favorable)|upgraded|mentioned positively|target raised|supports approval|has an approvable|still approvable).*", headline):
                    value = 1
                elif re.match(".+ (target cut|reiterated Sell|should report weak(er)?|shares likely to be weak|growth seems to be slowing|estimates (reduced|trimmed)|fundamentals do not support|(will|appears to) be hurt|should be sold|valuations? (is )?(still )?(remains )?unattractive|(should|will) encounter more competition|expectations could be aggressive|remains overvalued|indicate slowing|likely to lose|faces risk|should report (.+)?weaker|will face (.+)?slowdown|sales ((appear to be|are) )?deteriorating|downgraded|estimates lowered|removed from .+ List|removed from Top Picks|still likely to fail|likely to stimulate fear|target lowered|a Sell at|lowers estimates|removed from (short[- ]term )?buy list).*", headline):
                    value = -1
                if value is not None:
                    handle_news(tickers[0], 'FREC', date_millis, value, born_millis)
                else:
                    util.warning('unmatched recommendations')
                    util.warning(headline + " " + str(tickers))
        elif category == 'Options':
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY1', date_millis, i, born_millis)
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            value = None
            if re.match(".+ puts? (options )?(more )?active.*", headline):
                value = -1
            elif re.match(".+ calls? (options )?(more )?active.*", headline):
                value = 1
            if value is not None:
                handle_news(tickers[0], 'FOPTION', date_millis, value, born_millis)
            else:
                util.warning('unmatched options')
                util.warning(headline + " " + str(tickers))
        elif category == 'Earnings':
            for i, ticker in enumerate(tickers):
                handle_news(ticker, 'FLY1', date_millis, i, born_millis)
                handle_news(ticker, 'FLY2', date_millis, i, born_millis)
            headline = headline.replace("break-even", "breakeven")
            headline = headline.replace("break even", "breakeven")
            if headline.find("consensus") != -1:
                m = re.match(".+? (?P<reported>(%s(( to )|-))?%s) .*consensus.* (?P<consensus>%s)" % (number, number, number), headline)
                if m is not None:
                    gd = m.groupdict()
                    cons = normalize_number(gd['consensus'])
                    value = None
                    gd['reported'] = gd['reported'].replace("-", " to ")
                    if gd['reported'].find(" to ") != -1:
                        #reported figure is a range; compare its bounds to consensus
                        rvalues = gd['reported'].split(" to ")
                        rvalues[0] = normalize_number(rvalues[0])
                        rvalues[1] = normalize_number(rvalues[1])
                        replb = min(rvalues[0], rvalues[1])
                        repub = max(rvalues[0], rvalues[1])
                        if repub < cons:
                            value = -1
                        elif replb > cons:
                            value = 1
                        else:
                            value = 0
                    else:
                        rvalue = normalize_number(gd['reported'])
                        if rvalue < cons:
                            value = -1
                        elif rvalue > cons:
                            value = 1
                        else:
                            value = 0
                    handle_news(tickers[0], 'FEARN', date_millis, value, born_millis)
                else:
                    if re.match(".+ (above|will exceed|should meet or beat|at least meet) .*consensus.*", headline) is not None:
                        handle_news(tickers[0], 'FEARN', date_millis, 1, born_millis)
                    elif re.match(".+ (below|not expected to meet) .*consensus.*", headline) is not None:
                        handle_news(tickers[0], 'FEARN', date_millis, -1, born_millis)
                    else:
                        util.warning('unmatched consensus')
                        util.warning(headline + " " + str(tickers))
        elif category == 'Technical Analysis':
            pass
        elif category == 'Conference/Events':
            pass
        elif category == 'General news':
            pass
        elif category == 'Periodicals':
            pass
        elif category == 'Syndicate':
            value = None
            if re.match(".+ ([Tt]o [Ss]ell) .+", headline):
                value = -1
            if value is not None:
                handle_news(tickers[0], 'FSYND', date_millis, value, born_millis)
            else:
                util.warning('unmatched syndicate')
                util.warning(headline + " " + str(tickers))
        else:
            util.warning('unknown category')
            util.warning(category + " " + headline + " " + str(tickers))
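# Since the classifier is just ordered re.match rules over cleaned headlines,
# individual rules can be spot-checked in isolation. The headlines below are
# invented; the patterns are copied verbatim from the category logic above.
def _exampleHeadlineRules():
    assert re.match(".+ puts? (options )?(more )?active.*",
                    "XYZ puts active") is not None
    assert re.match(".+ (cuts|to cut|to lower|lowers|decreases|suspends|plans to suspend|lowering) .*dividend.*",
                    "XYZ lowers quarterly dividend") is not None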